From 2769420ee4e93f0df75981c3642d2754f5f8acc6 Mon Sep 17 00:00:00 2001 From: Atchut Kumar Barli Date: Tue, 17 Mar 2026 13:47:53 -0700 Subject: [PATCH 01/15] prediag --- src/connectedk8s/HISTORY.rst | 1 + .../azext_connectedk8s/_constants.py | 1 + .../azext_connectedk8s/_precheckutils.py | 32 +++++++++++++++++++ src/connectedk8s/azext_connectedk8s/custom.py | 8 +++++ 4 files changed, 42 insertions(+) diff --git a/src/connectedk8s/HISTORY.rst b/src/connectedk8s/HISTORY.rst index b80684c1b19..7fdf7e99d61 100644 --- a/src/connectedk8s/HISTORY.rst +++ b/src/connectedk8s/HISTORY.rst @@ -5,6 +5,7 @@ Release History 1.10.12 +++++ * Removed deprecated '--app-id' and '--app-secret' RBAC parameters from the extension. +* Added telemetry for pre-onboarding diagnostic results, including diagnostics execution failures. 1.10.11 +++++++ diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index 4ec7bbb96ee..55109077e6a 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -124,6 +124,7 @@ PublicKey_Export_Fault_Type = "publickey-export-error" PrivateKey_Export_Fault_Type = "privatekey-export-error" Install_HelmRelease_Fault_Type = "helm-release-install-error" +Install_Prediagnostics_Fault_Type = "prediagnostics-execution-error" Delete_HelmRelease_Fault_Type = "helm-release-delete-error" Check_PodStatus_Fault_Type = "check-pod-status-error" Kubernetes_Connectivity_FaultType = "kubernetes-cluster-connection-error" diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 231f2b4c659..b12a52e6f10 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -30,6 +30,38 @@ diagnoser_output: list[str] = [] +def send_pre_diagnostic_telemetry( + diagnostic_result: str, execution_status: str +) -> None: + """Send pre-diagnostic telemetry for diagnostics execution failures and check failures.""" + pre_diagnostic_details = " | ".join( + item.strip().replace("\n", " ") + for item in diagnoser_output + if item and item.strip() + ) + + prediagnostic_error_messages = ( + f"executionStatus={execution_status}; diagnosticResult={diagnostic_result}" + ) + if pre_diagnostic_details: + prediagnostic_error_messages += ( + "; details=" + + ( + pre_diagnostic_details[:1021] + "..." + if len(pre_diagnostic_details) > 1024 + else pre_diagnostic_details + ) + ) + + prediagnostic_error_detail = { + "Context.Default.AzureCLI.onboardingErrorType": consts.Install_Prediagnostics_Fault_Type, + "Context.Default.AzureCLI.onboardingErrorMessage": prediagnostic_error_messages, + } + + logger.warning(f"Sending pre-diagnostic telemetry: {prediagnostic_error_messages}") + telemetry.add_extension_event("connectedk8s", prediagnostic_error_detail) + + def fetch_diagnostic_checks_results( cmd: CLICommand, corev1_api_instance: CoreV1Api, diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index d0e399bcbb2..774109235e1 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -374,6 +374,10 @@ def create_connectedk8s( ) except Exception as e: + precheckutils.send_pre_diagnostic_telemetry( + diagnostic_result=diagnostic_checks, + execution_status="ExecutionFailed", + ) ex_msg = f"An exception occured while trying to execute pre-onboarding diagnostic checks : {e}" summ_msg = f"An exception occured while trying to execute pre-onboarding diagnostic checks : {e}" telemetry.set_exception( @@ -401,6 +405,10 @@ def create_connectedk8s( and not azure_local_disconnected and not lowbandwidth ): + precheckutils.send_pre_diagnostic_telemetry( + diagnostic_result=diagnostic_checks, + execution_status="Completed", + ) if storage_space_available: logger.warning( "The pre-check result logs logs have been saved at this path: " From a3173c925cf6b3ff2dad2f4da81c35561eca547f Mon Sep 17 00:00:00 2001 From: Atchut Kumar Barli Date: Tue, 31 Mar 2026 16:03:06 -0700 Subject: [PATCH 02/15] messagefix --- .../azext_connectedk8s/_constants.py | 3 +- .../azext_connectedk8s/_precheckutils.py | 122 +++++++++++++----- src/connectedk8s/azext_connectedk8s/custom.py | 17 ++- 3 files changed, 103 insertions(+), 39 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index 55109077e6a..31f0a696cff 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -124,7 +124,8 @@ PublicKey_Export_Fault_Type = "publickey-export-error" PrivateKey_Export_Fault_Type = "privatekey-export-error" Install_HelmRelease_Fault_Type = "helm-release-install-error" -Install_Prediagnostics_Fault_Type = "prediagnostics-execution-error" +Install_Prediagnostics_Fault_Type = "prediagnostics-failure" +Install_Prediagnostics_Job_Execution_Error_Fault_Type = "prediagnostics-job-execution-error" Delete_HelmRelease_Fault_Type = "helm-release-delete-error" Check_PodStatus_Fault_Type = "check-pod-status-error" Kubernetes_Connectivity_FaultType = "kubernetes-cluster-connection-error" diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index b12a52e6f10..1ffea15483f 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -28,40 +28,73 @@ # pylint: disable diagnoser_output: list[str] = [] +prediagnostic_job_execution_status = "NotStarted" +prediagnostic_dns_check = "Starting" +prediagnostic_outbound_check = "Starting" -def send_pre_diagnostic_telemetry( - diagnostic_result: str, execution_status: str -) -> None: - """Send pre-diagnostic telemetry for diagnostics execution failures and check failures.""" - pre_diagnostic_details = " | ".join( - item.strip().replace("\n", " ") - for item in diagnoser_output - if item and item.strip() - ) +def send_prediagnostic_job_execution_error_telemetry(reason: str = "") -> None: + """Send telemetry when prediagnostic job execution fails.""" + error_message = f"jobExecutionStatus={prediagnostic_job_execution_status}" + if reason: + error_message += f"; reason={reason}" - prediagnostic_error_messages = ( - f"executionStatus={execution_status}; diagnosticResult={diagnostic_result}" - ) - if pre_diagnostic_details: - prediagnostic_error_messages += ( - "; details=" - + ( - pre_diagnostic_details[:1021] + "..." - if len(pre_diagnostic_details) > 1024 - else pre_diagnostic_details - ) - ) + prediagnostic_error_detail = { + "Context.Default.AzureCLI.onboardingErrorType": consts.Install_Prediagnostics_Job_Execution_Error_Fault_Type, + "Context.Default.AzureCLI.onboardingErrorMessage": error_message, + } + + logger.warning(f"Sending prediagnostic job execution error telemetry: {error_message}") + telemetry.add_extension_event("connectedk8s", prediagnostic_error_detail) + + +def send_prediagnostic_check_failure_telemetry( + dns_check: str, outbound_connectivity_check: str +) -> None: + """Send telemetry when prediagnostic checks fail (job completed but checks did not pass).""" + import json + + # Extract error messages from diagnoser_output + dns_error = "" + outbound_error = "" + for msg in diagnoser_output: + msg_lower = msg.lower() + # Capture DNS-specific errors + if dns_check == "Failed" and "dns" in msg_lower and "error" in msg_lower: + dns_error = msg.strip() + # Capture outbound connectivity errors + if outbound_connectivity_check == "Failed" and "outbound" in msg_lower and "error" in msg_lower: + outbound_error = msg.strip() + + check_results = { + "dnsCheck": dns_check, + "outboundConnectivityCheck": outbound_connectivity_check, + } + + # Only add error details if checks actually failed + if dns_error: + check_results["dnsError"] = dns_error + if outbound_error: + check_results["outboundError"] = outbound_error + + error_message = json.dumps(check_results) prediagnostic_error_detail = { "Context.Default.AzureCLI.onboardingErrorType": consts.Install_Prediagnostics_Fault_Type, - "Context.Default.AzureCLI.onboardingErrorMessage": prediagnostic_error_messages, + "Context.Default.AzureCLI.onboardingErrorMessage": error_message, } - logger.warning(f"Sending pre-diagnostic telemetry: {prediagnostic_error_messages}") + logger.warning(f"Sending prediagnostic check failure telemetry: {error_message}") telemetry.add_extension_event("connectedk8s", prediagnostic_error_detail) +def get_precheck_failure_summary() -> str: + for output in reversed(diagnoser_output): + if output.startswith("Precheck summary:"): + return output + return "" + + def fetch_diagnostic_checks_results( cmd: CLICommand, corev1_api_instance: CoreV1Api, @@ -79,10 +112,12 @@ def fetch_diagnostic_checks_results( filepath_with_timestamp: str, storage_space_available: bool, ) -> tuple[str, bool]: + global prediagnostic_job_execution_status, prediagnostic_dns_check, prediagnostic_outbound_check try: - # Setting DNS and Outbound Check as working - dns_check = "Starting" - outbound_connectivity_check = "Starting" + diagnoser_output.clear() + prediagnostic_job_execution_status = "NotStarted" + prediagnostic_dns_check = "Starting" + prediagnostic_outbound_check = "Starting" # Executing the cluster_diagnostic_checks job and fetching the logs obtained cluster_diagnostic_checks_container_log = ( executing_cluster_diagnostic_checks_job( @@ -104,10 +139,15 @@ def fetch_diagnostic_checks_results( ) ) # If cluster_diagnostic_checks_container_log is not empty there were errors. Try to read the logs. - if ( - cluster_diagnostic_checks_container_log is not None - and cluster_diagnostic_checks_container_log != "" - ): + if cluster_diagnostic_checks_container_log is None: + diagnoser_output.append( + "Precheck summary: " + f"jobExecutionStatus={prediagnostic_job_execution_status}; " + f"dnsCheck={dns_check}; outboundConnectivityCheck={outbound_connectivity_check}" + ) + return consts.Diagnostic_Check_Incomplete, storage_space_available + + if cluster_diagnostic_checks_container_log != "": cluster_diagnostic_checks_container_log_list = ( cluster_diagnostic_checks_container_log.split("\n") ) @@ -134,6 +174,7 @@ def fetch_diagnostic_checks_results( storage_space_available, diagnoser_output, ) + prediagnostic_dns_check = dns_check outbound_connectivity_check, storage_space_available = ( azext_utils.check_cluster_outbound_connectivity( outbound_connectivity_check_log, @@ -142,6 +183,7 @@ def fetch_diagnostic_checks_results( diagnoser_output, ) ) + prediagnostic_outbound_check = outbound_connectivity_check else: return consts.Diagnostic_Check_Passed, storage_space_available @@ -150,8 +192,18 @@ def fetch_diagnostic_checks_results( dns_check == consts.Diagnostic_Check_Incomplete or outbound_connectivity_check == consts.Diagnostic_Check_Incomplete ): + diagnoser_output.append( + "Precheck summary: " + f"jobExecutionStatus={prediagnostic_job_execution_status}; " + f"dnsCheck={dns_check}; outboundConnectivityCheck={outbound_connectivity_check}" + ) return consts.Diagnostic_Check_Incomplete, storage_space_available + diagnoser_output.append( + "Precheck summary: " + f"jobExecutionStatus={prediagnostic_job_execution_status}; " + f"dnsCheck={dns_check}; outboundConnectivityCheck={outbound_connectivity_check}" + ) return consts.Diagnostic_Check_Failed, storage_space_available # To handle any exception that may occur during the execution @@ -186,6 +238,7 @@ def executing_cluster_diagnostic_checks_job( filepath_with_timestamp: str, storage_space_available: bool, ) -> str | None: + global prediagnostic_job_execution_status job_name = "cluster-diagnostic-checks-job" # Setting the log output as Empty cluster_diagnostic_checks_container_log = "" @@ -206,6 +259,7 @@ def executing_cluster_diagnostic_checks_job( # To handle the user keyboard Interrupt try: + prediagnostic_job_execution_status = "Running" # Executing the Cluster Diagnostic Checks Job yaml config.load_kube_config(kube_config, kube_context) # checking existence of the release and if present we delete the stale release @@ -232,6 +286,7 @@ def executing_cluster_diagnostic_checks_job( exception_occured_counter = 1 # If any exception occured we will print the exception and return if exception_occured_counter == 1: + prediagnostic_job_execution_status = "CleanupFailed" logger.warning( "Cleanup of previous diagnostic checks helm release failed and hence couldn't " 'install the new helm release. Please cleanup older release using "helm delete ' @@ -339,6 +394,7 @@ def executing_cluster_diagnostic_checks_job( # If job is not scheduled then we will delete the helm release if is_job_scheduled is False: + prediagnostic_job_execution_status = "NotScheduled" telemetry.set_exception( exception="Couldn't schedule Cluster Diagnostic Checks Job in the cluster", fault_type=consts.Cluster_Diagnostic_Checks_Job_Not_Scheduled, @@ -356,6 +412,7 @@ def executing_cluster_diagnostic_checks_job( return None if is_job_complete is False: + prediagnostic_job_execution_status = "NotCompleted" # Job was scheduled successfully, but didn't complete. We will fetch the logs and delete helm release. logger.debug( "Cluster Diagnostic Checks Job Failed. Fetch results and delete Helm release in the cluster" @@ -435,9 +492,12 @@ def executing_cluster_diagnostic_checks_job( # To handle any exception that may occur during the execution except Exception as e: + prediagnostic_job_execution_status = "ExecutionFailed" Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) raise CLIInternalError(f"Failed to execute Cluster Diagnostic Checks Job: {e}") - + if is_job_complete: + prediagnostic_job_execution_status = "Completed" + logger.debug(cluster_diagnostic_checks_container_log) #atchub delete return cluster_diagnostic_checks_container_log diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 774109235e1..da39facbf2b 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -374,10 +374,7 @@ def create_connectedk8s( ) except Exception as e: - precheckutils.send_pre_diagnostic_telemetry( - diagnostic_result=diagnostic_checks, - execution_status="ExecutionFailed", - ) + precheckutils.send_prediagnostic_job_execution_error_telemetry(reason=str(e)) ex_msg = f"An exception occured while trying to execute pre-onboarding diagnostic checks : {e}" summ_msg = f"An exception occured while trying to execute pre-onboarding diagnostic checks : {e}" telemetry.set_exception( @@ -405,9 +402,13 @@ def create_connectedk8s( and not azure_local_disconnected and not lowbandwidth ): - precheckutils.send_pre_diagnostic_telemetry( - diagnostic_result=diagnostic_checks, - execution_status="Completed", + precheck_failure_summary = precheckutils.get_precheck_failure_summary() + precheck_failure_summary_msg = ( + f" Details: {precheck_failure_summary}" if precheck_failure_summary else "" + ) + precheckutils.send_prediagnostic_check_failure_telemetry( + precheckutils.prediagnostic_dns_check, + precheckutils.prediagnostic_outbound_check, ) if storage_space_available: logger.warning( @@ -426,6 +427,7 @@ def create_connectedk8s( "meet the prerequisites - " + consts.Doc_Onboarding_PreRequisites_Url + " and try onboarding again." + + precheck_failure_summary_msg ) raise ValidationError(err_msg) @@ -438,6 +440,7 @@ def create_connectedk8s( err_msg = ( "One or more pre-onboarding diagnostic checks failed and hence not proceeding with " "cluster onboarding. Please resolve them and try onboarding again." + + precheck_failure_summary_msg ) raise ValidationError(err_msg) From 004b1109ae68aac553a121a54f96cbe374516cd4 Mon Sep 17 00:00:00 2001 From: Atchut Kumar Barli Date: Wed, 1 Apr 2026 09:39:59 -0700 Subject: [PATCH 03/15] logprediagerrors --- .../azext_connectedk8s/_constants.py | 3 + .../azext_connectedk8s/_precheckutils.py | 120 ++++++++++++++---- src/connectedk8s/azext_connectedk8s/_utils.py | 3 + src/connectedk8s/azext_connectedk8s/custom.py | 25 +++- 4 files changed, 125 insertions(+), 26 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index 31f0a696cff..fe6cd584f90 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -126,6 +126,7 @@ Install_HelmRelease_Fault_Type = "helm-release-install-error" Install_Prediagnostics_Fault_Type = "prediagnostics-failure" Install_Prediagnostics_Job_Execution_Error_Fault_Type = "prediagnostics-job-execution-error" +Post_Diagnostic_Precheck_Fault_Type = "post-diagnostic-precheck-failure" Delete_HelmRelease_Fault_Type = "helm-release-delete-error" Check_PodStatus_Fault_Type = "check-pod-status-error" Kubernetes_Connectivity_FaultType = "kubernetes-cluster-connection-error" @@ -477,6 +478,8 @@ "Outbound network connectivity check failed for Cluster Connect" ) DNS_Check_Result_String = "DNS Result:" +Entra_Connectivity_Check_Result_String = "Entra Authentication Endpoint Connectivity Check Result" +CRD_Ownership_Check_Failed_String = "Check Failed: CRD" AZ_CLI_ADAL_TO_MSAL_MIGRATE_VERSION = "2.30.0" CLIENT_PROXY_VERSION = "1.3.033281" CLIENT_PROXY_FOLDER = ".clientproxy" diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 1ffea15483f..a9848bde7a8 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -4,6 +4,7 @@ # -------------------------------------------------------------------------------------------- from __future__ import annotations +import json import os import shutil from subprocess import PIPE, Popen @@ -31,20 +32,22 @@ prediagnostic_job_execution_status = "NotStarted" prediagnostic_dns_check = "Starting" prediagnostic_outbound_check = "Starting" +prediagnostic_entra_check = "Starting" +prediagnostic_crd_check = "Starting" def send_prediagnostic_job_execution_error_telemetry(reason: str = "") -> None: """Send telemetry when prediagnostic job execution fails.""" - error_message = f"jobExecutionStatus={prediagnostic_job_execution_status}" + error_detail_msg = {"jobExecutionStatus": prediagnostic_job_execution_status} if reason: - error_message += f"; reason={reason}" + error_detail_msg["reason"] = reason + error_message = json.dumps(error_detail_msg) prediagnostic_error_detail = { "Context.Default.AzureCLI.onboardingErrorType": consts.Install_Prediagnostics_Job_Execution_Error_Fault_Type, "Context.Default.AzureCLI.onboardingErrorMessage": error_message, } - - logger.warning(f"Sending prediagnostic job execution error telemetry: {error_message}") + print(f"[Telemetry] onboardingErrorType={consts.Install_Prediagnostics_Job_Execution_Error_Fault_Type} onboardingErrorMessage={error_message}") telemetry.add_extension_event("connectedk8s", prediagnostic_error_detail) @@ -52,11 +55,11 @@ def send_prediagnostic_check_failure_telemetry( dns_check: str, outbound_connectivity_check: str ) -> None: """Send telemetry when prediagnostic checks fail (job completed but checks did not pass).""" - import json - # Extract error messages from diagnoser_output dns_error = "" outbound_error = "" + entra_error = "" + crd_error = "" for msg in diagnoser_output: msg_lower = msg.lower() # Capture DNS-specific errors @@ -65,10 +68,16 @@ def send_prediagnostic_check_failure_telemetry( # Capture outbound connectivity errors if outbound_connectivity_check == "Failed" and "outbound" in msg_lower and "error" in msg_lower: outbound_error = msg.strip() + if prediagnostic_entra_check == "Failed" and "entra" in msg_lower: + entra_error = msg.strip() + if prediagnostic_crd_check == "Failed" and "crd" in msg_lower: + crd_error = msg.strip() check_results = { "dnsCheck": dns_check, "outboundConnectivityCheck": outbound_connectivity_check, + "entraCheck": prediagnostic_entra_check, + "crdCheck": prediagnostic_crd_check, } # Only add error details if checks actually failed @@ -76,6 +85,10 @@ def send_prediagnostic_check_failure_telemetry( check_results["dnsError"] = dns_error if outbound_error: check_results["outboundError"] = outbound_error + if entra_error: + check_results["entraError"] = entra_error + if crd_error: + check_results["crdError"] = crd_error error_message = json.dumps(check_results) @@ -84,10 +97,21 @@ def send_prediagnostic_check_failure_telemetry( "Context.Default.AzureCLI.onboardingErrorMessage": error_message, } - logger.warning(f"Sending prediagnostic check failure telemetry: {error_message}") + print(f"[Telemetry] onboardingErrorType={consts.Install_Prediagnostics_Fault_Type} onboardingErrorMessage={error_message}") telemetry.add_extension_event("connectedk8s", prediagnostic_error_detail) +def send_post_diagnostic_precheck_failure_telemetry(check_name: str, reason: str) -> None: + """Send telemetry for individual precheck failures that occur after the diagnostic job.""" + error_message = json.dumps({"checkName": check_name, "reason": reason}) + error_detail = { + "Context.Default.AzureCLI.onboardingErrorType": consts.Post_Diagnostic_Precheck_Fault_Type, + "Context.Default.AzureCLI.onboardingErrorMessage": error_message, + } + print(f"[Telemetry] onboardingErrorType={consts.Post_Diagnostic_Precheck_Fault_Type} onboardingErrorMessage={error_message}") + telemetry.add_extension_event("connectedk8s", error_detail) + + def get_precheck_failure_summary() -> str: for output in reversed(diagnoser_output): if output.startswith("Precheck summary:"): @@ -112,12 +136,14 @@ def fetch_diagnostic_checks_results( filepath_with_timestamp: str, storage_space_available: bool, ) -> tuple[str, bool]: - global prediagnostic_job_execution_status, prediagnostic_dns_check, prediagnostic_outbound_check + global prediagnostic_job_execution_status, prediagnostic_dns_check, prediagnostic_outbound_check, prediagnostic_entra_check, prediagnostic_crd_check try: diagnoser_output.clear() prediagnostic_job_execution_status = "NotStarted" prediagnostic_dns_check = "Starting" prediagnostic_outbound_check = "Starting" + prediagnostic_entra_check = "Starting" + prediagnostic_crd_check = "Starting" # Executing the cluster_diagnostic_checks job and fetching the logs obtained cluster_diagnostic_checks_container_log = ( executing_cluster_diagnostic_checks_job( @@ -143,8 +169,10 @@ def fetch_diagnostic_checks_results( diagnoser_output.append( "Precheck summary: " f"jobExecutionStatus={prediagnostic_job_execution_status}; " - f"dnsCheck={dns_check}; outboundConnectivityCheck={outbound_connectivity_check}" + f"dnsCheck={prediagnostic_dns_check}; outboundConnectivityCheck={prediagnostic_outbound_check}; " + f"entraCheck={prediagnostic_entra_check}; crdCheck={prediagnostic_crd_check}" ) + send_prediagnostic_job_execution_error_telemetry() return consts.Diagnostic_Check_Incomplete, storage_space_available if cluster_diagnostic_checks_container_log != "": @@ -154,6 +182,8 @@ def fetch_diagnostic_checks_results( cluster_diagnostic_checks_container_log_list.pop(-1) dns_check_log = "" outbound_connectivity_check_log = "" + entra_check_log = "" + crd_check_log = "" counter_container_logs = 1 # For retrieving only cluster_diagnostic_checks logs from the output for outputs in cluster_diagnostic_checks_container_log_list: @@ -163,6 +193,12 @@ def fetch_diagnostic_checks_results( outbound_connectivity_check_log += outputs else: outbound_connectivity_check_log += " " + outputs + elif consts.Entra_Connectivity_Check_Result_String in outputs: + entra_check_log = outputs + counter_container_logs = 1 + elif consts.CRD_Ownership_Check_Failed_String in outputs: + crd_check_log += outputs + "\n" + counter_container_logs = 1 elif consts.DNS_Check_Result_String in outputs: dns_check_log += outputs counter_container_logs = 0 @@ -184,27 +220,66 @@ def fetch_diagnostic_checks_results( ) ) prediagnostic_outbound_check = outbound_connectivity_check + + # Parse Entra check result + # If no Entra result line found, the helm chart version may not support it — treat as NotApplicable (skip) + if entra_check_log: + # Format: "Entra Authentication Endpoint Connectivity Check Result : : " + parts = entra_check_log.strip().split(" : ") + if len(parts) >= 3: + entra_response_code = parts[-1].strip() + if entra_response_code in ("200", "404"): + prediagnostic_entra_check = consts.Diagnostic_Check_Passed + else: + prediagnostic_entra_check = consts.Diagnostic_Check_Failed + diagnoser_output.append( + f"Error: Entra authentication endpoint connectivity check failed. " + f"Response code: {entra_response_code}. " + "Please ensure outbound connectivity to the Entra (Azure AD) authentication endpoint.\n" + ) + else: + prediagnostic_entra_check = consts.Diagnostic_Check_Incomplete + else: + # Entra check not present in logs — older helm chart version, not applicable + prediagnostic_entra_check = "NotApplicable" + + # Parse CRD ownership check result + if crd_check_log: + prediagnostic_crd_check = consts.Diagnostic_Check_Failed + diagnoser_output.append( + f"Error: CRD ownership validation failed.\n{crd_check_log.strip()}" + ) + else: + prediagnostic_crd_check = consts.Diagnostic_Check_Passed else: return consts.Diagnostic_Check_Passed, storage_space_available - # If any of the check remain Incomplete than we will return Incomplete + diagnoser_output.append( + "Precheck summary: " + f"jobExecutionStatus={prediagnostic_job_execution_status}; " + f"dnsCheck={dns_check}; outboundConnectivityCheck={outbound_connectivity_check}; " + f"entraCheck={prediagnostic_entra_check}; crdCheck={prediagnostic_crd_check}" + ) + + # Return Incomplete if any mandatory check couldn't be determined if ( dns_check == consts.Diagnostic_Check_Incomplete or outbound_connectivity_check == consts.Diagnostic_Check_Incomplete + or prediagnostic_entra_check == consts.Diagnostic_Check_Incomplete ): - diagnoser_output.append( - "Precheck summary: " - f"jobExecutionStatus={prediagnostic_job_execution_status}; " - f"dnsCheck={dns_check}; outboundConnectivityCheck={outbound_connectivity_check}" - ) return consts.Diagnostic_Check_Incomplete, storage_space_available - diagnoser_output.append( - "Precheck summary: " - f"jobExecutionStatus={prediagnostic_job_execution_status}; " - f"dnsCheck={dns_check}; outboundConnectivityCheck={outbound_connectivity_check}" - ) - return consts.Diagnostic_Check_Failed, storage_space_available + # Return Failed only if at least one check actually failed + if ( + dns_check == consts.Diagnostic_Check_Failed + or outbound_connectivity_check == consts.Diagnostic_Check_Failed + or prediagnostic_entra_check == consts.Diagnostic_Check_Failed + or prediagnostic_crd_check == consts.Diagnostic_Check_Failed + ): + return consts.Diagnostic_Check_Failed, storage_space_available + + # All checks passed or not applicable + return consts.Diagnostic_Check_Passed, storage_space_available # To handle any exception that may occur during the execution except Exception as e: @@ -212,6 +287,7 @@ def fetch_diagnostic_checks_results( "An exception has occured while trying to execute cluster diagnostic checks " "container on the cluster." ) + send_prediagnostic_job_execution_error_telemetry(reason=str(e)) telemetry.set_exception( exception=e, fault_type=consts.Cluster_Diagnostic_Checks_Execution_Failed_Fault_Type, @@ -497,7 +573,7 @@ def executing_cluster_diagnostic_checks_job( raise CLIInternalError(f"Failed to execute Cluster Diagnostic Checks Job: {e}") if is_job_complete: prediagnostic_job_execution_status = "Completed" - logger.debug(cluster_diagnostic_checks_container_log) #atchub delete + logger.debug(cluster_diagnostic_checks_container_log) return cluster_diagnostic_checks_container_log diff --git a/src/connectedk8s/azext_connectedk8s/_utils.py b/src/connectedk8s/azext_connectedk8s/_utils.py index 15547848469..9f1ac6be802 100644 --- a/src/connectedk8s/azext_connectedk8s/_utils.py +++ b/src/connectedk8s/azext_connectedk8s/_utils.py @@ -383,6 +383,9 @@ def check_cluster_DNS( if ( "NXDOMAIN" in formatted_dns_log or "connection timed out" in formatted_dns_log + or "no servers could be reached" in formatted_dns_log + or "communications error" in formatted_dns_log + or "timed out" in formatted_dns_log ): logger.warning( "Error: We found an issue with the DNS resolution on your cluster. For details about debugging DNS " diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index da39facbf2b..e2cd6e8f0cc 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -367,6 +367,12 @@ def create_connectedk8s( filepath_with_timestamp, storage_space_available, 1 ) + if precheckutils.diagnoser_output: + print("\n--- Pre-onboarding Diagnostic Check Results ---") + for line in precheckutils.diagnoser_output: + print(line.rstrip()) + print("--- End of Diagnostic Check Results ---\n") + if storage_space_available is False: logger.warning( "There is no storage space available on your device and hence not saving cluster " @@ -406,10 +412,13 @@ def create_connectedk8s( precheck_failure_summary_msg = ( f" Details: {precheck_failure_summary}" if precheck_failure_summary else "" ) - precheckutils.send_prediagnostic_check_failure_telemetry( - precheckutils.prediagnostic_dns_check, - precheckutils.prediagnostic_outbound_check, - ) + if precheckutils.prediagnostic_job_execution_status in ("Completed", "NotCompleted"): + precheckutils.send_prediagnostic_check_failure_telemetry( + precheckutils.prediagnostic_dns_check, + precheckutils.prediagnostic_outbound_check, + ) + else: + precheckutils.send_prediagnostic_job_execution_error_telemetry() if storage_space_available: logger.warning( "The pre-check result logs logs have been saved at this path: " @@ -461,6 +470,10 @@ def create_connectedk8s( fault_type=consts.Linux_Node_Not_Exists, summary="Couldn't find any node on the kubernetes cluster with the OS 'linux'", ) + precheckutils.send_post_diagnostic_precheck_failure_telemetry( + check_name="LinuxNodeExists", + reason="Couldn't find any node on the kubernetes cluster with the OS 'linux'", + ) logger.warning( "Please ensure that this Kubernetes cluster has any nodes with OS 'linux', for scheduling the " "Arc-Agents onto and connecting to Azure. Learn more at %s", @@ -479,6 +492,10 @@ def create_connectedk8s( fault_type=consts.Cannot_Create_ClusterRoleBindings_Fault_Type, summary=summ_msg, ) + precheckutils.send_post_diagnostic_precheck_failure_telemetry( + check_name="ClusterRoleBindings", + reason=ex_msg, + ) err_msg = ( "Your credentials doesn't have permission to create clusterrolebindings on this " "kubernetes cluster. Please check your permissions." From 3f27457733a2226193e4941c9616935a88b94bfb Mon Sep 17 00:00:00 2001 From: Atchut Kumar Barli Date: Wed, 1 Apr 2026 09:53:09 -0700 Subject: [PATCH 04/15] historyupdate --- src/connectedk8s/HISTORY.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/connectedk8s/HISTORY.rst b/src/connectedk8s/HISTORY.rst index 7fdf7e99d61..8adc0830400 100644 --- a/src/connectedk8s/HISTORY.rst +++ b/src/connectedk8s/HISTORY.rst @@ -2,6 +2,10 @@ Release History =============== +1.10.13 ++++++ +* Added telemetry for pre-onboarding diagnostic results, including diagnostics execution failures. + 1.10.12 +++++ * Removed deprecated '--app-id' and '--app-secret' RBAC parameters from the extension. From 432bc14f699d5424cfdf60040dffc783aad1bc4f Mon Sep 17 00:00:00 2001 From: Atchut Kumar Barli Date: Wed, 1 Apr 2026 17:24:54 -0700 Subject: [PATCH 05/15] duplicatestring --- src/connectedk8s/HISTORY.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/src/connectedk8s/HISTORY.rst b/src/connectedk8s/HISTORY.rst index 8adc0830400..0b83feb5efa 100644 --- a/src/connectedk8s/HISTORY.rst +++ b/src/connectedk8s/HISTORY.rst @@ -9,7 +9,6 @@ Release History 1.10.12 +++++ * Removed deprecated '--app-id' and '--app-secret' RBAC parameters from the extension. -* Added telemetry for pre-onboarding diagnostic results, including diagnostics execution failures. 1.10.11 +++++++ From 1c2a7e6c4ef7983b9e92dda661bf7279bd98bf8b Mon Sep 17 00:00:00 2001 From: Atchut Kumar Barli Date: Fri, 10 Apr 2026 12:44:53 -0700 Subject: [PATCH 06/15] test --- src/connectedk8s/policy.yam; | 16 ++ src/connectedk8s/test.json | 14 ++ testing/test_prediagnostic_telemetry.ps1 | 265 ++++++++++++++++++++++ testing/test_prediagnostic_telemetry.sh | 277 +++++++++++++++++++++++ 4 files changed, 572 insertions(+) create mode 100644 src/connectedk8s/policy.yam; create mode 100644 src/connectedk8s/test.json create mode 100644 testing/test_prediagnostic_telemetry.ps1 create mode 100644 testing/test_prediagnostic_telemetry.sh diff --git a/src/connectedk8s/policy.yam; b/src/connectedk8s/policy.yam; new file mode 100644 index 00000000000..ed5b3ccda76 --- /dev/null +++ b/src/connectedk8s/policy.yam; @@ -0,0 +1,16 @@ +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: block-dns + namespace: azure-arc-release +spec: + podSelector: {} # applies to all pods in namespace + policyTypes: + - Egress + egress: + - ports: + - port: 443 # allow HTTPS so only DNS is broken, not everything + protocol: TCP + - port: 80 + protocol: TCP + # port 53 (UDP+TCP) is NOT listed → blocked \ No newline at end of file diff --git a/src/connectedk8s/test.json b/src/connectedk8s/test.json new file mode 100644 index 00000000000..89fd221c196 --- /dev/null +++ b/src/connectedk8s/test.json @@ -0,0 +1,14 @@ +{'Context.Default.AzureCLI.onboardingErrorType': 'prediagnostics-job-execution-error', +'Context.Default.AzureCLI.onboardingErrorMessage': + 'jobExecutionStatus=ExecutionFailed; reason=Failed to execute Cluster Diagnostic Checks Job: (400)\nReason: Bad Request\nHTTP response headers: HTTPHeaderDict({\'Audit-Id\': \'5a3b0ffc-444f-4b05-b406-b48889d1c38c\', \'Cache-Control\': \'no-cache, private\', \'Content-Type\': \'application/json\', \'Date\': \'Mon, + 23 Mar 2026 02: 16: 00 GMT\', \'Content-Length\': \'250\' + })\nHTTP response body: { + "kind": "Status", + "apiVersion": "v1", + "metadata": {}, + "status": "Failure", + "message": "container \\"cluster-diagnostic-checks-container\\" in pod \\"cluster-diagnostic-checks-job-vtg9d\\" is waiting to start: ContainerCreating", + "reason": "BadRequest", + "code": 400 + }\n\n', 'Context.Default.AzureCLI.ExtensionName': 'connectedk8s', 'Reserved.DataModel.CorrelationId': 'ed794940-25ba-43bf-86bf-7a9ce8d0fd58' +} \ No newline at end of file diff --git a/testing/test_prediagnostic_telemetry.ps1 b/testing/test_prediagnostic_telemetry.ps1 new file mode 100644 index 00000000000..0a30b11a49b --- /dev/null +++ b/testing/test_prediagnostic_telemetry.ps1 @@ -0,0 +1,265 @@ +# test_prediagnostic_telemetry.ps1 +# Exercises all prediagnostic failing scenarios and verifies az connectedk8s connect fails. +# Usage: .\test_prediagnostic_telemetry.ps1 +# Prerequisites: kubectl configured, az cli with connectedk8s extension from source (env1), kubeconfig set. + +param( + [string]$ResourceGroup = "audittest", + [string]$Location = "eastus2euap" +) + +# ── Helpers ────────────────────────────────────────────────────────────────── + +$PASS = "[PASS]" +$FAIL = "[FAIL]" +$INFO = "[INFO]" + +function Log-Info { param($msg) Write-Host "$INFO $msg" -ForegroundColor Cyan } +function Log-Pass { param($msg) Write-Host "$PASS $msg" -ForegroundColor Green } +function Log-Fail { param($msg) Write-Host "$FAIL $msg" -ForegroundColor Red } +function Log-Sep { Write-Host ("`n" + "─" * 70) -ForegroundColor DarkGray } + +$OriginalCorefile = $null + +function Save-CoreDNS { + $script:OriginalCorefile = kubectl get configmap coredns -n kube-system -o jsonpath='{.data.Corefile}' 2>&1 + Log-Info "CoreDNS original config saved." +} + +function Restore-CoreDNS { + if (-not $script:OriginalCorefile) { return } + Log-Info "Restoring CoreDNS..." + $patch = @{data = @{Corefile = $script:OriginalCorefile}} | ConvertTo-Json -Compress -Depth 5 + kubectl patch configmap coredns -n kube-system --type merge -p $patch | Out-Null + kubectl rollout restart deployment/coredns -n kube-system | Out-Null + kubectl rollout status deployment/coredns -n kube-system --timeout=60s | Out-Null + Log-Info "CoreDNS restored." +} + +function Apply-CoreDNS-Block { + param([string[]]$Hosts) + $hostsBlock = ($Hosts | ForEach-Object { " 192.0.2.1 $_" }) -join "`n" + $newCorefile = @" +.:53 { + errors + ready + health { + lameduck 5s + } + hosts { +$hostsBlock + fallthrough + } + kubernetes cluster.local in-addr.arpa ip6.arpa { + pods insecure + fallthrough in-addr.arpa ip6.arpa + ttl 30 + } + prometheus :9153 + forward . /etc/resolv.conf + cache 30 + loop + reload + loadbalance + import custom/*.override + template ANY ANY internal.cloudapp.net { + match "^(?:[^.]+\.){4,}internal\.cloudapp\.net\.$" + rcode NXDOMAIN + fallthrough + } + template ANY ANY reddog.microsoft.com { + rcode NXDOMAIN + } +} +import custom/*.server +"@ + $patch = @{data = @{Corefile = $newCorefile}} | ConvertTo-Json -Compress -Depth 5 + kubectl patch configmap coredns -n kube-system --type merge -p $patch | Out-Null + kubectl rollout restart deployment/coredns -n kube-system | Out-Null + kubectl rollout status deployment/coredns -n kube-system --timeout=60s | Out-Null + Log-Info "CoreDNS block applied for: $($Hosts -join ', ')" +} + +function Run-ConnectTest { + param([string]$ClusterName, [string]$TestDescription) + Log-Info "Running: az connectedk8s connect -g $ResourceGroup -n $ClusterName" + $output = az connectedk8s connect -g $ResourceGroup -n $ClusterName --location $Location 2>&1 + $exitCode = $LASTEXITCODE + + $telemetryLines = $output | Where-Object { $_ -match "\[Telemetry\]" } + $resultLines = $output | Where-Object { $_ -match "Pre-onboarding Diagnostic|Precheck summary|pre-checks|required pre-checks" } + + Write-Host "`n ── Output excerpt ──" + $resultLines | ForEach-Object { Write-Host " $_" -ForegroundColor Yellow } + $telemetryLines | ForEach-Object { Write-Host " $_" -ForegroundColor Magenta } + + if ($exitCode -ne 0) { + Log-Pass "$TestDescription → command failed as expected (exit $exitCode)" + if (-not $telemetryLines) { + Write-Host " WARNING: No [Telemetry] line found in output." -ForegroundColor DarkYellow + } + } else { + Log-Fail "$TestDescription → command SUCCEEDED but was expected to FAIL" + } +} + +function Cleanup-AzResource { + param([string]$ClusterName) + Log-Info "Cleaning up ARM resource: $ClusterName (if it exists)" + az connectedk8s delete -g $ResourceGroup -n $ClusterName --force -y 2>&1 | Out-Null +} + +function Apply-BadCRD { + param([string]$CRDName) + $manifest = @" +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: $CRDName + annotations: + meta.helm.sh/release-name: some-other-component + meta.helm.sh/release-namespace: default +spec: + group: clusterconfig.azure.com + names: + kind: FakeResource + listKind: FakeResourceList + plural: $(($CRDName -split '\.')[0]) + singular: fakeresource + scope: Cluster + versions: + - name: v1 + served: true + storage: true + schema: + openAPIV3Schema: + type: object +"@ + $manifest | kubectl apply -f - 2>&1 | Out-Null + Log-Info "Bad CRD applied: $CRDName" +} + +function Remove-CRD { + param([string]$CRDName) + kubectl delete crd $CRDName --ignore-not-found=true 2>&1 | Out-Null + Log-Info "CRD removed: $CRDName" +} + +function Apply-PodQuota { + $quota = @" +apiVersion: v1 +kind: ResourceQuota +metadata: + name: block-pods + namespace: azure-arc-release +spec: + hard: + pods: "0" +"@ + kubectl create namespace azure-arc-release --dry-run=client -o yaml | kubectl apply -f - 2>&1 | Out-Null + $quota | kubectl apply -f - 2>&1 | Out-Null + Log-Info "ResourceQuota applied: pods=0 in azure-arc-release" +} + +function Remove-PodQuota { + kubectl delete resourcequota block-pods -n azure-arc-release --ignore-not-found=true 2>&1 | Out-Null + Log-Info "ResourceQuota removed." +} + +# ── Main ───────────────────────────────────────────────────────────────────── + +$results = @() + +Save-CoreDNS + +# ───────────────────────────────────────────────────────────────────────────── +Log-Sep +Log-Info "TEST 1: Block MCR (outbound connectivity failure)" +Log-Info "Expected telemetry: onboardingErrorType=prediagnostics-failure, outboundConnectivityCheck=Failed" +# ───────────────────────────────────────────────────────────────────────────── +$clusterName = "adblocktest-mcr" +Cleanup-AzResource $clusterName +Apply-CoreDNS-Block -Hosts @("mcr.microsoft.com") +Run-ConnectTest -ClusterName $clusterName -TestDescription "MCR outbound block" +Restore-CoreDNS +Cleanup-AzResource $clusterName +$results += "TEST 1 - MCR Block" + +# ───────────────────────────────────────────────────────────────────────────── +Log-Sep +Log-Info "TEST 2: Block Entra auth endpoint (Entra check failure)" +Log-Info "Expected telemetry: onboardingErrorType=prediagnostics-failure, entraCheck=Failed" +# ───────────────────────────────────────────────────────────────────────────── +$clusterName = "adblocktest-entra" +Cleanup-AzResource $clusterName +Apply-CoreDNS-Block -Hosts @("login.microsoftonline.com") +Run-ConnectTest -ClusterName $clusterName -TestDescription "Entra endpoint block" +Restore-CoreDNS +Cleanup-AzResource $clusterName +$results += "TEST 2 - Entra Block" + +# ───────────────────────────────────────────────────────────────────────────── +Log-Sep +Log-Info "TEST 3: Block BOTH MCR + Entra (combined outbound failure)" +Log-Info "Expected telemetry: onboardingErrorType=prediagnostics-failure, outboundConnectivityCheck=Failed, entraCheck=Failed" +# ───────────────────────────────────────────────────────────────────────────── +$clusterName = "adblocktest-all-outbound" +Cleanup-AzResource $clusterName +Apply-CoreDNS-Block -Hosts @("mcr.microsoft.com", "login.microsoftonline.com") +Run-ConnectTest -ClusterName $clusterName -TestDescription "MCR + Entra combined block" +Restore-CoreDNS +Cleanup-AzResource $clusterName +$results += "TEST 3 - MCR + Entra Block" + +# ───────────────────────────────────────────────────────────────────────────── +Log-Sep +Log-Info "TEST 4: CRD ownership conflict (crdCheck failure)" +Log-Info "Expected telemetry: onboardingErrorType=prediagnostics-failure, crdCheck=Failed" +# ───────────────────────────────────────────────────────────────────────────── +$clusterName = "adblocktest-crd" +Cleanup-AzResource $clusterName +Apply-BadCRD "extensionconfigs.clusterconfig.azure.com" +Run-ConnectTest -ClusterName $clusterName -TestDescription "CRD ownership conflict" +Remove-CRD "extensionconfigs.clusterconfig.azure.com" +Cleanup-AzResource $clusterName +$results += "TEST 4 - CRD Conflict" + +# ───────────────────────────────────────────────────────────────────────────── +Log-Sep +Log-Info "TEST 5: Job cannot be scheduled (ResourceQuota pods=0)" +Log-Info "Expected telemetry: onboardingErrorType=prediagnostics-job-execution-error, jobExecutionStatus=NotScheduled" +# ───────────────────────────────────────────────────────────────────────────── +$clusterName = "adblocktest-nojob" +Cleanup-AzResource $clusterName +Apply-PodQuota +Run-ConnectTest -ClusterName $clusterName -TestDescription "Job not schedulable" +Remove-PodQuota +Cleanup-AzResource $clusterName +$results += "TEST 5 - Job Not Schedulable" + +# ───────────────────────────────────────────────────────────────────────────── +Log-Sep +Log-Info "TEST 6: Happy path (all checks pass — command should SUCCEED)" +Log-Info "Expected: no [Telemetry] failure lines, 'pre-checks have succeeded'" +# ───────────────────────────────────────────────────────────────────────────── +$clusterName = "adblocktest-happy" +Log-Info "Running: az connectedk8s connect -g $ResourceGroup -n $clusterName" +$output = az connectedk8s connect -g $ResourceGroup -n $clusterName --location $Location 2>&1 +$exitCode = $LASTEXITCODE +$telemetryFailLines = $output | Where-Object { $_ -match "\[Telemetry\].*prediagnostics" } +if ($exitCode -eq 0 -and -not $telemetryFailLines) { + Log-Pass "Happy path → command succeeded, no failure telemetry" +} elseif ($exitCode -eq 0 -and $telemetryFailLines) { + Log-Fail "Happy path → command succeeded BUT unexpected [Telemetry] failure lines found:" + $telemetryFailLines | ForEach-Object { Write-Host " $_" -ForegroundColor Red } +} else { + Log-Fail "Happy path → command FAILED unexpectedly (exit $exitCode)" +} +Cleanup-AzResource $clusterName +$results += "TEST 6 - Happy Path" + +# ───────────────────────────────────────────────────────────────────────────── +Log-Sep +Write-Host "`nTest run complete. Scenarios executed:" -ForegroundColor White +$results | ForEach-Object { Write-Host " • $_" -ForegroundColor Gray } +Log-Sep diff --git a/testing/test_prediagnostic_telemetry.sh b/testing/test_prediagnostic_telemetry.sh new file mode 100644 index 00000000000..decd421b18c --- /dev/null +++ b/testing/test_prediagnostic_telemetry.sh @@ -0,0 +1,277 @@ +#!/bin/bash +# test_prediagnostic_telemetry.sh +# Exercises all prediagnostic failing scenarios and verifies az connectedk8s connect fails. +# Usage: bash test_prediagnostic_telemetry.sh [resource_group] [location] +# Prerequisites: kubectl configured, az cli with connectedk8s extension installed, kubeconfig set. + +RESOURCE_GROUP="${1:-audittest}" +LOCATION="${2:-eastus2euap}" +ORIGINAL_COREFILE="" +PASS_COUNT=0 +FAIL_COUNT=0 + +# ── Colors ─────────────────────────────────────────────────────────────────── +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +MAGENTA='\033[0;35m' +GRAY='\033[0;37m' +NC='\033[0m' # No Color + +log_info() { echo -e "${CYAN}[INFO]${NC} $1"; } +log_pass() { echo -e "${GREEN}[PASS]${NC} $1"; ((PASS_COUNT++)); } +log_fail() { echo -e "${RED}[FAIL]${NC} $1"; ((FAIL_COUNT++)); } +log_sep() { echo -e "\n${GRAY}$(printf '─%.0s' {1..70})${NC}"; } + +# ── CoreDNS helpers ────────────────────────────────────────────────────────── + +save_coredns() { + ORIGINAL_COREFILE=$(kubectl get configmap coredns -n kube-system -o jsonpath='{.data.Corefile}' 2>&1) + log_info "CoreDNS original config saved." +} + +restore_coredns() { + if [[ -z "$ORIGINAL_COREFILE" ]]; then return; fi + log_info "Restoring CoreDNS..." + kubectl patch configmap coredns -n kube-system --type merge \ + -p "{\"data\":{\"Corefile\":$(echo "$ORIGINAL_COREFILE" | python3 -c 'import json,sys; print(json.dumps(sys.stdin.read()))')}}" \ + > /dev/null 2>&1 + kubectl rollout restart deployment/coredns -n kube-system > /dev/null 2>&1 + kubectl rollout status deployment/coredns -n kube-system --timeout=60s > /dev/null 2>&1 + log_info "CoreDNS restored." +} + +# Apply a CoreDNS hosts block redirecting specified hostnames to 192.0.2.1 (black-hole) +# Usage: apply_coredns_block "host1 host2 ..." +apply_coredns_block() { + local hosts_entries="" + for host in $1; do + hosts_entries+=" 192.0.2.1 ${host}\n" + done + + local new_corefile + new_corefile=$(cat < /dev/null 2>&1 + kubectl rollout restart deployment/coredns -n kube-system > /dev/null 2>&1 + kubectl rollout status deployment/coredns -n kube-system --timeout=60s > /dev/null 2>&1 + log_info "CoreDNS block applied for: $1" +} + +# ── Test runner ────────────────────────────────────────────────────────────── + +run_connect_test() { + local cluster_name="$1" + local test_desc="$2" + + log_info "Running: az connectedk8s connect -g $RESOURCE_GROUP -n $cluster_name" + output=$(az connectedk8s connect -g "$RESOURCE_GROUP" -n "$cluster_name" --location "$LOCATION" 2>&1) + exit_code=$? + + echo "" + echo " ── Output excerpt ──" + echo "$output" | grep -E "Pre-onboarding Diagnostic|Precheck summary|pre-checks|required pre-checks" \ + | while IFS= read -r line; do echo -e " ${YELLOW}${line}${NC}"; done + echo "$output" | grep "\[Telemetry\]" \ + | while IFS= read -r line; do echo -e " ${MAGENTA}${line}${NC}"; done + + if [[ $exit_code -ne 0 ]]; then + log_pass "$test_desc → command failed as expected (exit $exit_code)" + if ! echo "$output" | grep -q "\[Telemetry\]"; then + echo -e " ${YELLOW}WARNING: No [Telemetry] line found in output.${NC}" + fi + else + log_fail "$test_desc → command SUCCEEDED but was expected to FAIL" + fi +} + +cleanup_az_resource() { + local cluster_name="$1" + log_info "Cleaning up ARM resource: $cluster_name (if it exists)" + az connectedk8s delete -g "$RESOURCE_GROUP" -n "$cluster_name" --force -y > /dev/null 2>&1 +} + +apply_bad_crd() { + local crd_name="$1" + kubectl apply -f - > /dev/null 2>&1 < /dev/null 2>&1 + log_info "CRD removed: $1" +} + +apply_pod_quota() { + kubectl create namespace azure-arc-release --dry-run=client -o yaml | kubectl apply -f - > /dev/null 2>&1 + kubectl apply -f - > /dev/null 2>&1 < /dev/null 2>&1 + log_info "ResourceQuota removed." +} + +# ── Main ───────────────────────────────────────────────────────────────────── + +echo -e "\n${CYAN}Pre-onboarding Diagnostic Telemetry Test Suite${NC}" +echo -e "${CYAN}Resource Group: $RESOURCE_GROUP | Location: $LOCATION${NC}" + +save_coredns + +# ───────────────────────────────────────────────────────────────────────────── +log_sep +log_info "TEST 1: Block MCR (outbound connectivity failure)" +log_info "Expected telemetry: onboardingErrorType=prediagnostics-failure, outboundConnectivityCheck=Failed" +# ───────────────────────────────────────────────────────────────────────────── +CLUSTER="adblocktest-mcr" +cleanup_az_resource "$CLUSTER" +apply_coredns_block "mcr.microsoft.com" +run_connect_test "$CLUSTER" "MCR outbound block" +restore_coredns +cleanup_az_resource "$CLUSTER" + +# ───────────────────────────────────────────────────────────────────────────── +log_sep +log_info "TEST 2: Block Entra auth endpoint (Entra check failure)" +log_info "Expected telemetry: onboardingErrorType=prediagnostics-failure, entraCheck=Failed" +# ───────────────────────────────────────────────────────────────────────────── +CLUSTER="adblocktest-entra" +cleanup_az_resource "$CLUSTER" +apply_coredns_block "login.microsoftonline.com" +run_connect_test "$CLUSTER" "Entra endpoint block" +restore_coredns +cleanup_az_resource "$CLUSTER" + +# ───────────────────────────────────────────────────────────────────────────── +log_sep +log_info "TEST 3: Block BOTH MCR + Entra (combined outbound failure)" +log_info "Expected telemetry: outboundConnectivityCheck=Failed, entraCheck=Failed" +# ───────────────────────────────────────────────────────────────────────────── +CLUSTER="adblocktest-all-outbound" +cleanup_az_resource "$CLUSTER" +apply_coredns_block "mcr.microsoft.com login.microsoftonline.com" +run_connect_test "$CLUSTER" "MCR + Entra combined block" +restore_coredns +cleanup_az_resource "$CLUSTER" + +# ───────────────────────────────────────────────────────────────────────────── +log_sep +log_info "TEST 4: CRD ownership conflict (crdCheck failure)" +log_info "Expected telemetry: onboardingErrorType=prediagnostics-failure, crdCheck=Failed" +# ───────────────────────────────────────────────────────────────────────────── +CLUSTER="adblocktest-crd" +cleanup_az_resource "$CLUSTER" +apply_bad_crd "extensionconfigs.clusterconfig.azure.com" +run_connect_test "$CLUSTER" "CRD ownership conflict" +remove_crd "extensionconfigs.clusterconfig.azure.com" +cleanup_az_resource "$CLUSTER" + +# ───────────────────────────────────────────────────────────────────────────── +log_sep +log_info "TEST 5: Job cannot be scheduled (ResourceQuota pods=0)" +log_info "Expected telemetry: onboardingErrorType=prediagnostics-job-execution-error, jobExecutionStatus=NotScheduled" +# ───────────────────────────────────────────────────────────────────────────── +CLUSTER="adblocktest-nojob" +cleanup_az_resource "$CLUSTER" +apply_pod_quota +run_connect_test "$CLUSTER" "Job not schedulable" +remove_pod_quota +cleanup_az_resource "$CLUSTER" + +# ───────────────────────────────────────────────────────────────────────────── +log_sep +log_info "TEST 6: Happy path (all checks pass — command should SUCCEED)" +log_info "Expected: no [Telemetry] failure lines, command exits 0" +# ───────────────────────────────────────────────────────────────────────────── +CLUSTER="adblocktest-happy" +log_info "Running: az connectedk8s connect -g $RESOURCE_GROUP -n $CLUSTER" +output=$(az connectedk8s connect -g "$RESOURCE_GROUP" -n "$CLUSTER" --location "$LOCATION" 2>&1) +exit_code=$? +telemetry_fail=$(echo "$output" | grep "\[Telemetry\].*prediagnostics") + +if [[ $exit_code -eq 0 && -z "$telemetry_fail" ]]; then + log_pass "Happy path → command succeeded, no failure telemetry" +elif [[ $exit_code -eq 0 && -n "$telemetry_fail" ]]; then + log_fail "Happy path → command succeeded BUT unexpected [Telemetry] failure lines found:" + echo "$telemetry_fail" | while IFS= read -r line; do echo -e " ${RED}${line}${NC}"; done +else + log_fail "Happy path → command FAILED unexpectedly (exit $exit_code)" +fi +cleanup_az_resource "$CLUSTER" + +# ───────────────────────────────────────────────────────────────────────────── +log_sep +echo "" +echo -e "${CYAN}Test run complete.${NC}" +echo -e " ${GREEN}Passed: $PASS_COUNT${NC}" +echo -e " ${RED}Failed: $FAIL_COUNT${NC}" +log_sep From 96e11454d92fbb4467b7eb7fb88ab82373e02956 Mon Sep 17 00:00:00 2001 From: Atchut Kumar Barli Date: Wed, 15 Apr 2026 21:46:12 -0700 Subject: [PATCH 07/15] fix: prediagnostic telemetry improvements - Fix empty-log NotCompleted returning Passed instead of Incomplete - Fix double telemetry firing (job-execution-error + check-failure) - Fix NotApplicable for all checks when pod never ran - Fix Entra/CRD error message parsing (filter non-error lines) - Trim multi-line error messages to first line - Add always-save log for completed jobs - Add console diagnostic output block - Add State 3 telemetry for LinuxNodeExists and ClusterRoleBindings - Add Post_Diagnostic_Precheck_Fault_Type constant Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../azext_connectedk8s/_constants.py | 2 +- .../azext_connectedk8s/_precheckutils.py | 73 +++++++++++++++++-- src/connectedk8s/azext_connectedk8s/custom.py | 7 +- 3 files changed, 71 insertions(+), 11 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index 7eb6937e828..d74d483d5dc 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -421,7 +421,7 @@ # Connect Precheck Diagnoser constants Cluster_Diagnostic_Checks_Job_Registry_Path = ( - "azurearck8s/helmchart/stable/clusterdiagnosticchecks:1.31.2" + "azurearck8s/helmchart/stable/clusterdiagnosticchecks:1.33.0" ) Cluster_Diagnostic_Checks_Helm_Install_Failed_Fault_Type = ( "Error while installing cluster diagnostic checks helm release" diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index a9848bde7a8..1b326ae1f56 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -62,16 +62,15 @@ def send_prediagnostic_check_failure_telemetry( crd_error = "" for msg in diagnoser_output: msg_lower = msg.lower() - # Capture DNS-specific errors + # Capture first line of each error message to keep telemetry concise if dns_check == "Failed" and "dns" in msg_lower and "error" in msg_lower: - dns_error = msg.strip() - # Capture outbound connectivity errors + dns_error = msg.strip().splitlines()[0] if outbound_connectivity_check == "Failed" and "outbound" in msg_lower and "error" in msg_lower: - outbound_error = msg.strip() - if prediagnostic_entra_check == "Failed" and "entra" in msg_lower: - entra_error = msg.strip() - if prediagnostic_crd_check == "Failed" and "crd" in msg_lower: - crd_error = msg.strip() + outbound_error = msg.strip().splitlines()[0] + if prediagnostic_entra_check == "Failed" and "entra" in msg_lower and "error" in msg_lower: + entra_error = msg.strip().splitlines()[0] + if prediagnostic_crd_check == "Failed" and "crd" in msg_lower and "error" in msg_lower: + crd_error = msg.strip().splitlines()[0] check_results = { "dnsCheck": dns_check, @@ -252,6 +251,21 @@ def fetch_diagnostic_checks_results( else: prediagnostic_crd_check = consts.Diagnostic_Check_Passed else: + # Empty log — if job didn't complete (e.g., pod never scheduled), treat as Incomplete not Passed + if prediagnostic_job_execution_status == "NotCompleted": + # Mark all individual checks as NotApplicable since the pod never produced output + prediagnostic_dns_check = "NotApplicable" + prediagnostic_outbound_check = "NotApplicable" + prediagnostic_entra_check = "NotApplicable" + prediagnostic_crd_check = "NotApplicable" + diagnoser_output.append( + "Precheck summary: " + f"jobExecutionStatus={prediagnostic_job_execution_status}; " + f"dnsCheck={prediagnostic_dns_check}; outboundConnectivityCheck={prediagnostic_outbound_check}; " + f"entraCheck={prediagnostic_entra_check}; crdCheck={prediagnostic_crd_check}" + ) + send_prediagnostic_job_execution_error_telemetry() + return consts.Diagnostic_Check_Incomplete, storage_space_available return consts.Diagnostic_Check_Passed, storage_space_available diagnoser_output.append( @@ -563,6 +577,49 @@ def executing_cluster_diagnostic_checks_job( "possible reasons can be resource constraints on the cluster.\n" ) + # Fetch and save container logs when job completed successfully (always save for diagnostics) + if is_job_complete: + all_pods = corev1_api_instance.list_namespaced_pod("azure-arc-release") + for each_pod in all_pods.items: + pod_name = each_pod.metadata.name + if not pod_name.startswith(job_name): + continue + try: + cluster_diagnostic_checks_container_log = ( + corev1_api_instance.read_namespaced_pod_log( + name=pod_name, + container="cluster-diagnostic-checks-container", + namespace="azure-arc-release", + ) + ) + if storage_space_available: + log_path = os.path.join( + filepath_with_timestamp, + "cluster_diagnostic_checks_job_log.txt", + ) + with open(log_path, "w+") as f: + f.write(cluster_diagnostic_checks_container_log) + except OSError as e: + if "[Errno 28]" in str(e): + storage_space_available = False + telemetry.set_exception( + exception=e, + fault_type=consts.No_Storage_Space_Available_Fault_Type, + summary="No space left on device", + ) + shutil.rmtree(filepath_with_timestamp, ignore_errors=False) + else: + logger.exception( + "An exception has occured while saving the Cluster " + "Diagnostic Checks Job logs in the local machine." + ) + except Exception as e: + logger.exception( + "An exception has occured while saving the Cluster " + "Diagnostic Checks Job logs in the local machine." + ) + break + # Clearing all the resources after fetching the cluster diagnostic checks container logs Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index e2cd6e8f0cc..6f3cfd8c92e 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -412,12 +412,15 @@ def create_connectedk8s( precheck_failure_summary_msg = ( f" Details: {precheck_failure_summary}" if precheck_failure_summary else "" ) - if precheckutils.prediagnostic_job_execution_status in ("Completed", "NotCompleted"): + if precheckutils.prediagnostic_job_execution_status == "Completed" or ( + precheckutils.prediagnostic_job_execution_status == "NotCompleted" + and precheckutils.prediagnostic_dns_check != "NotApplicable" + ): precheckutils.send_prediagnostic_check_failure_telemetry( precheckutils.prediagnostic_dns_check, precheckutils.prediagnostic_outbound_check, ) - else: + elif precheckutils.prediagnostic_job_execution_status not in ("Completed", "NotCompleted"): precheckutils.send_prediagnostic_job_execution_error_telemetry() if storage_space_available: logger.warning( From 0005d2376cdd7aa50dc8814f0cab10c77f2cd977 Mon Sep 17 00:00:00 2001 From: Atchut Kumar Barli Date: Wed, 15 Apr 2026 21:54:12 -0700 Subject: [PATCH 08/15] prompt --- .../azext_connectedk8s/_precheckutils.py | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 1b326ae1f56..01c97b70af9 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -36,6 +36,23 @@ prediagnostic_crd_check = "Starting" +def _debug_add_extension_event(properties: dict) -> None: + """Debug wrapper around telemetry.add_extension_event to surface silent failures.""" + import azure.cli.core.telemetry as _tel_mod # pylint: disable=import-outside-toplevel + session = getattr(_tel_mod, "_session", None) + before = len(getattr(session, "events", [])) if session else -1 + try: + telemetry.add_extension_event("connectedk8s", properties) + except Exception as ex: # pylint: disable=broad-except + print(f"[Telemetry DEBUG] add_extension_event raised: {ex}") + after = len(getattr(session, "events", [])) if session else -1 + print(f"[Telemetry DEBUG] session.events before={before} after={after} (delta={after - before})") + if session: + extra = getattr(session, "_extension_events", None) or getattr(session, "extra_events", None) + print(f"[Telemetry DEBUG] extension_events count: {len(extra) if extra is not None else 'N/A'}") + print(f"[Telemetry DEBUG] is_telemetry_enabled: {getattr(_tel_mod, 'is_telemetry_enabled', lambda: 'N/A')()}") + + def send_prediagnostic_job_execution_error_telemetry(reason: str = "") -> None: """Send telemetry when prediagnostic job execution fails.""" error_detail_msg = {"jobExecutionStatus": prediagnostic_job_execution_status} @@ -48,7 +65,7 @@ def send_prediagnostic_job_execution_error_telemetry(reason: str = "") -> None: "Context.Default.AzureCLI.onboardingErrorMessage": error_message, } print(f"[Telemetry] onboardingErrorType={consts.Install_Prediagnostics_Job_Execution_Error_Fault_Type} onboardingErrorMessage={error_message}") - telemetry.add_extension_event("connectedk8s", prediagnostic_error_detail) + _debug_add_extension_event(prediagnostic_error_detail) def send_prediagnostic_check_failure_telemetry( @@ -97,7 +114,7 @@ def send_prediagnostic_check_failure_telemetry( } print(f"[Telemetry] onboardingErrorType={consts.Install_Prediagnostics_Fault_Type} onboardingErrorMessage={error_message}") - telemetry.add_extension_event("connectedk8s", prediagnostic_error_detail) + _debug_add_extension_event(prediagnostic_error_detail) def send_post_diagnostic_precheck_failure_telemetry(check_name: str, reason: str) -> None: @@ -108,7 +125,7 @@ def send_post_diagnostic_precheck_failure_telemetry(check_name: str, reason: str "Context.Default.AzureCLI.onboardingErrorMessage": error_message, } print(f"[Telemetry] onboardingErrorType={consts.Post_Diagnostic_Precheck_Fault_Type} onboardingErrorMessage={error_message}") - telemetry.add_extension_event("connectedk8s", error_detail) + _debug_add_extension_event(error_detail) def get_precheck_failure_summary() -> str: From 638bde845e4b2842b0f20548d4b6084bb2c99388 Mon Sep 17 00:00:00 2001 From: Atchut Kumar Barli Date: Wed, 15 Apr 2026 23:29:49 -0700 Subject: [PATCH 09/15] test: add unit tests for prediagnostic telemetry functions - Add test_precheckutils.py with 14 unit tests covering: - send_prediagnostic_job_execution_error_telemetry (error type, status, reason) - send_prediagnostic_check_failure_telemetry (check results, error extraction, multiline trimming, non-error line filtering) - send_post_diagnostic_precheck_failure_telemetry (error type, check name/reason) - Use sys.modules stubs to avoid heavy runtime dependencies - Fix code corruption in _precheckutils.py (IDE selection leaked into source) - Remove temporary debug helper (_debug_add_extension_event) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../azext_connectedk8s/_precheckutils.py | 23 +- .../tests/unittests/test_precheckutils.py | 476 ++++++++++++++++++ 2 files changed, 479 insertions(+), 20 deletions(-) create mode 100644 src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 01c97b70af9..1b326ae1f56 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -36,23 +36,6 @@ prediagnostic_crd_check = "Starting" -def _debug_add_extension_event(properties: dict) -> None: - """Debug wrapper around telemetry.add_extension_event to surface silent failures.""" - import azure.cli.core.telemetry as _tel_mod # pylint: disable=import-outside-toplevel - session = getattr(_tel_mod, "_session", None) - before = len(getattr(session, "events", [])) if session else -1 - try: - telemetry.add_extension_event("connectedk8s", properties) - except Exception as ex: # pylint: disable=broad-except - print(f"[Telemetry DEBUG] add_extension_event raised: {ex}") - after = len(getattr(session, "events", [])) if session else -1 - print(f"[Telemetry DEBUG] session.events before={before} after={after} (delta={after - before})") - if session: - extra = getattr(session, "_extension_events", None) or getattr(session, "extra_events", None) - print(f"[Telemetry DEBUG] extension_events count: {len(extra) if extra is not None else 'N/A'}") - print(f"[Telemetry DEBUG] is_telemetry_enabled: {getattr(_tel_mod, 'is_telemetry_enabled', lambda: 'N/A')()}") - - def send_prediagnostic_job_execution_error_telemetry(reason: str = "") -> None: """Send telemetry when prediagnostic job execution fails.""" error_detail_msg = {"jobExecutionStatus": prediagnostic_job_execution_status} @@ -65,7 +48,7 @@ def send_prediagnostic_job_execution_error_telemetry(reason: str = "") -> None: "Context.Default.AzureCLI.onboardingErrorMessage": error_message, } print(f"[Telemetry] onboardingErrorType={consts.Install_Prediagnostics_Job_Execution_Error_Fault_Type} onboardingErrorMessage={error_message}") - _debug_add_extension_event(prediagnostic_error_detail) + telemetry.add_extension_event("connectedk8s", prediagnostic_error_detail) def send_prediagnostic_check_failure_telemetry( @@ -114,7 +97,7 @@ def send_prediagnostic_check_failure_telemetry( } print(f"[Telemetry] onboardingErrorType={consts.Install_Prediagnostics_Fault_Type} onboardingErrorMessage={error_message}") - _debug_add_extension_event(prediagnostic_error_detail) + telemetry.add_extension_event("connectedk8s", prediagnostic_error_detail) def send_post_diagnostic_precheck_failure_telemetry(check_name: str, reason: str) -> None: @@ -125,7 +108,7 @@ def send_post_diagnostic_precheck_failure_telemetry(check_name: str, reason: str "Context.Default.AzureCLI.onboardingErrorMessage": error_message, } print(f"[Telemetry] onboardingErrorType={consts.Post_Diagnostic_Precheck_Fault_Type} onboardingErrorMessage={error_message}") - _debug_add_extension_event(error_detail) + telemetry.add_extension_event("connectedk8s", error_detail) def get_precheck_failure_summary() -> str: diff --git a/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py b/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py new file mode 100644 index 00000000000..c2e3ce6da80 --- /dev/null +++ b/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py @@ -0,0 +1,476 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- +"""Unit tests for prediagnostic telemetry functions in _precheckutils.py.""" +from __future__ import annotations + +import json +import os +import sys +from unittest.mock import MagicMock, patch + +import pytest + +# Stub out heavy dependencies before importing the module under test +_STUBS = { + "kubernetes": MagicMock(), + "kubernetes.config": MagicMock(), + "kubernetes.watch": MagicMock(), + "kubernetes.client": MagicMock(), + "kubernetes.client.models": MagicMock(), + "azure": MagicMock(), + "azure.cli": MagicMock(), + "azure.cli.core": MagicMock(), + "azure.cli.core.telemetry": MagicMock(), + "azure.cli.core.azclierror": MagicMock(), + "azure.cli.core.commands": MagicMock(), + "azure.cli.core.commands.client_factory": MagicMock(), + "azure.cli.core.util": MagicMock(), + "azure.cli.core._config": MagicMock(), + "azure.core": MagicMock(), + "azure.core.exceptions": MagicMock(), + "azure.mgmt": MagicMock(), + "azure.mgmt.core": MagicMock(), + "azure.mgmt.core.tools": MagicMock(), + "msrest": MagicMock(), + "msrestazure": MagicMock(), + "knack": MagicMock(), + "knack.log": MagicMock(), + "knack.help_files": MagicMock(), + "knack.util": MagicMock(), + "knack.cli": MagicMock(), + "knack.config": MagicMock(), + "knack.prompting": MagicMock(), + "knack.commands": MagicMock(), + "knack.arguments": MagicMock(), + "knack.events": MagicMock(), + # Stub the sibling module to avoid its transitive imports + "azext_connectedk8s._utils": MagicMock(), +} +for mod, stub in _STUBS.items(): + sys.modules[mod] = stub + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../../.."))) + +import azext_connectedk8s._constants as consts # noqa: E402 +import azext_connectedk8s._precheckutils as precheckutils # noqa: E402 + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _reset_globals(): + """Reset module-level globals to a clean state before each test.""" + precheckutils.diagnoser_output = [] + precheckutils.prediagnostic_job_execution_status = "NotStarted" + precheckutils.prediagnostic_entra_check = "Starting" + precheckutils.prediagnostic_crd_check = "Starting" + + +# --------------------------------------------------------------------------- +# send_prediagnostic_job_execution_error_telemetry +# --------------------------------------------------------------------------- + +class TestSendJobExecutionErrorTelemetry: + def setup_method(self): + _reset_globals() + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_sends_event_with_correct_error_type(self, mock_telemetry): + precheckutils.prediagnostic_job_execution_status = "ExecutionFailed" + precheckutils.send_prediagnostic_job_execution_error_telemetry() + + mock_telemetry.add_extension_event.assert_called_once() + args = mock_telemetry.add_extension_event.call_args + assert args[0][0] == "connectedk8s" + props = args[0][1] + assert props["Context.Default.AzureCLI.onboardingErrorType"] == consts.Install_Prediagnostics_Job_Execution_Error_Fault_Type + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_message_includes_job_execution_status(self, mock_telemetry): + precheckutils.prediagnostic_job_execution_status = "ExecutionFailed" + precheckutils.send_prediagnostic_job_execution_error_telemetry() + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert msg["jobExecutionStatus"] == "ExecutionFailed" + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_message_includes_reason_when_provided(self, mock_telemetry): + precheckutils.prediagnostic_job_execution_status = "NotCompleted" + precheckutils.send_prediagnostic_job_execution_error_telemetry(reason="ImagePullBackOff") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert msg["reason"] == "ImagePullBackOff" + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_message_omits_reason_when_empty(self, mock_telemetry): + precheckutils.send_prediagnostic_job_execution_error_telemetry() + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert "reason" not in msg + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_message_is_valid_json(self, mock_telemetry): + precheckutils.send_prediagnostic_job_execution_error_telemetry(reason="ContainerCreating") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert isinstance(msg, dict) + + +# --------------------------------------------------------------------------- +# send_prediagnostic_check_failure_telemetry +# --------------------------------------------------------------------------- + +class TestSendCheckFailureTelemetry: + def setup_method(self): + _reset_globals() + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_sends_event_with_correct_error_type(self, mock_telemetry): + precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Passed") + + mock_telemetry.add_extension_event.assert_called_once() + props = mock_telemetry.add_extension_event.call_args[0][1] + assert props["Context.Default.AzureCLI.onboardingErrorType"] == consts.Install_Prediagnostics_Fault_Type + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_check_results_in_message(self, mock_telemetry): + precheckutils.prediagnostic_entra_check = "Failed" + precheckutils.prediagnostic_crd_check = "Passed" + precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Failed") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert msg["dnsCheck"] == "Passed" + assert msg["outboundConnectivityCheck"] == "Failed" + assert msg["entraCheck"] == "Failed" + assert msg["crdCheck"] == "Passed" + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_entra_error_extracted_from_diagnoser_output(self, mock_telemetry): + precheckutils.prediagnostic_entra_check = "Failed" + precheckutils.diagnoser_output = [ + "Some log line", + "Error: Entra endpoint not reachable. Response code: 000", + ] + precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Passed") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert "entraError" in msg + assert "000" in msg["entraError"] + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_dns_error_extracted_from_diagnoser_output(self, mock_telemetry): + precheckutils.diagnoser_output = [ + "DNS error: resolution failed for test.example.com", + ] + precheckutils.send_prediagnostic_check_failure_telemetry("Failed", "Passed") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert "dnsError" in msg + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_outbound_error_extracted_from_diagnoser_output(self, mock_telemetry): + precheckutils.diagnoser_output = [ + "Outbound connectivity error: MCR not reachable", + ] + precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Failed") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert "outboundError" in msg + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_multiline_error_trimmed_to_first_line(self, mock_telemetry): + precheckutils.prediagnostic_entra_check = "Failed" + precheckutils.diagnoser_output = [ + "Error: Entra endpoint error line1\nline2\nline3", + ] + precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Passed") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert "\n" not in msg.get("entraError", "") + assert "line1" in msg.get("entraError", "") + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_no_error_detail_when_checks_pass(self, mock_telemetry): + precheckutils.prediagnostic_entra_check = "Passed" + precheckutils.prediagnostic_crd_check = "Passed" + precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Passed") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert "dnsError" not in msg + assert "entraError" not in msg + assert "outboundError" not in msg + assert "crdError" not in msg + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_non_error_lines_not_captured(self, mock_telemetry): + """Lines mentioning entra but not 'error' should not be captured.""" + precheckutils.prediagnostic_entra_check = "Failed" + precheckutils.diagnoser_output = [ + "Entra check: starting", + "Entra Authentication Endpoint Connectivity Check Result : https://login.microsoftonline.com : 000", + ] + precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Passed") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert "entraError" not in msg + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_crd_error_extracted_from_diagnoser_output(self, mock_telemetry): + precheckutils.prediagnostic_crd_check = "Failed" + precheckutils.diagnoser_output = [ + "CRD ownership error: extensionconfigs.clusterconfig.azure.com owned by another release", + ] + precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Passed") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert "crdError" in msg + + +# --------------------------------------------------------------------------- +# send_post_diagnostic_precheck_failure_telemetry +# --------------------------------------------------------------------------- + +class TestSendPostDiagnosticPrecheckFailureTelemetry: + def setup_method(self): + _reset_globals() + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_sends_event_with_correct_error_type(self, mock_telemetry): + precheckutils.send_post_diagnostic_precheck_failure_telemetry("LinuxNodeExists", "No Linux nodes found") + + mock_telemetry.add_extension_event.assert_called_once() + props = mock_telemetry.add_extension_event.call_args[0][1] + assert props["Context.Default.AzureCLI.onboardingErrorType"] == consts.Post_Diagnostic_Precheck_Fault_Type + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_message_includes_check_name_and_reason(self, mock_telemetry): + precheckutils.send_post_diagnostic_precheck_failure_telemetry("ClusterRoleBindings", "Insufficient permissions") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert msg["checkName"] == "ClusterRoleBindings" + assert msg["reason"] == "Insufficient permissions" + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_message_is_valid_json(self, mock_telemetry): + precheckutils.send_post_diagnostic_precheck_failure_telemetry("SomeCheck", "Some reason") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert isinstance(msg, dict) + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_different_check_names_produce_separate_events(self, mock_telemetry): + precheckutils.send_post_diagnostic_precheck_failure_telemetry("LinuxNodeExists", "No nodes") + precheckutils.send_post_diagnostic_precheck_failure_telemetry("ClusterRoleBindings", "No perms") + + assert mock_telemetry.add_extension_event.call_count == 2 + calls = mock_telemetry.add_extension_event.call_args_list + msg1 = json.loads(calls[0][0][1]["Context.Default.AzureCLI.onboardingErrorMessage"]) + msg2 = json.loads(calls[1][0][1]["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert msg1["checkName"] == "LinuxNodeExists" + assert msg2["checkName"] == "ClusterRoleBindings" + + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _reset_globals(): + """Reset module-level globals to a clean state before each test.""" + precheckutils.diagnoser_output = [] + precheckutils.prediagnostic_job_execution_status = "NotStarted" + precheckutils.prediagnostic_entra_check = "Starting" + precheckutils.prediagnostic_crd_check = "Starting" + + +# --------------------------------------------------------------------------- +# send_prediagnostic_job_execution_error_telemetry +# --------------------------------------------------------------------------- + +class TestSendJobExecutionErrorTelemetry: + def setup_method(self): + _reset_globals() + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_sends_event_with_correct_error_type(self, mock_telemetry): + precheckutils.prediagnostic_job_execution_status = "ExecutionFailed" + precheckutils.send_prediagnostic_job_execution_error_telemetry() + + mock_telemetry.add_extension_event.assert_called_once() + args = mock_telemetry.add_extension_event.call_args + assert args[0][0] == "connectedk8s" + props = args[0][1] + assert props["Context.Default.AzureCLI.onboardingErrorType"] == consts.Install_Prediagnostics_Job_Execution_Error_Fault_Type + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_message_includes_job_execution_status(self, mock_telemetry): + precheckutils.prediagnostic_job_execution_status = "ExecutionFailed" + precheckutils.send_prediagnostic_job_execution_error_telemetry() + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert msg["jobExecutionStatus"] == "ExecutionFailed" + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_message_includes_reason_when_provided(self, mock_telemetry): + precheckutils.prediagnostic_job_execution_status = "NotCompleted" + precheckutils.send_prediagnostic_job_execution_error_telemetry(reason="ImagePullBackOff") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert msg["reason"] == "ImagePullBackOff" + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_message_omits_reason_when_empty(self, mock_telemetry): + precheckutils.send_prediagnostic_job_execution_error_telemetry() + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert "reason" not in msg + + +# --------------------------------------------------------------------------- +# send_prediagnostic_check_failure_telemetry +# --------------------------------------------------------------------------- + +class TestSendCheckFailureTelemetry: + def setup_method(self): + _reset_globals() + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_sends_event_with_correct_error_type(self, mock_telemetry): + precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Passed") + + mock_telemetry.add_extension_event.assert_called_once() + props = mock_telemetry.add_extension_event.call_args[0][1] + assert props["Context.Default.AzureCLI.onboardingErrorType"] == consts.Install_Prediagnostics_Fault_Type + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_check_results_in_message(self, mock_telemetry): + precheckutils.prediagnostic_entra_check = "Failed" + precheckutils.prediagnostic_crd_check = "Passed" + precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Failed") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert msg["dnsCheck"] == "Passed" + assert msg["outboundConnectivityCheck"] == "Failed" + assert msg["entraCheck"] == "Failed" + assert msg["crdCheck"] == "Passed" + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_entra_error_extracted_from_diagnoser_output(self, mock_telemetry): + precheckutils.prediagnostic_entra_check = "Failed" + precheckutils.diagnoser_output = [ + "Some log line", + "Error: Entra endpoint not reachable. Response code: 000", + ] + precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Passed") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert "entraError" in msg + assert "000" in msg["entraError"] + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_dns_error_extracted_from_diagnoser_output(self, mock_telemetry): + precheckutils.diagnoser_output = [ + "DNS error: resolution failed for test.example.com", + ] + precheckutils.send_prediagnostic_check_failure_telemetry("Failed", "Passed") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert "dnsError" in msg + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_multiline_error_trimmed_to_first_line(self, mock_telemetry): + precheckutils.prediagnostic_entra_check = "Failed" + precheckutils.diagnoser_output = [ + "Error: Entra endpoint error line1\nline2\nline3", + ] + precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Passed") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert "\n" not in msg.get("entraError", "") + assert "line1" in msg.get("entraError", "") + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_no_error_detail_when_checks_pass(self, mock_telemetry): + precheckutils.prediagnostic_entra_check = "Passed" + precheckutils.prediagnostic_crd_check = "Passed" + precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Passed") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert "dnsError" not in msg + assert "entraError" not in msg + assert "outboundError" not in msg + assert "crdError" not in msg + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_non_error_lines_not_captured(self, mock_telemetry): + """Lines mentioning entra but not 'error' should not be captured.""" + precheckutils.prediagnostic_entra_check = "Failed" + precheckutils.diagnoser_output = [ + "Entra check: starting", + "Entra Authentication Endpoint Connectivity Check Result : https://login.microsoftonline.com : 000", + ] + precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Passed") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert "entraError" not in msg + + +# --------------------------------------------------------------------------- +# send_post_diagnostic_precheck_failure_telemetry +# --------------------------------------------------------------------------- + +class TestSendPostDiagnosticPrecheckFailureTelemetry: + def setup_method(self): + _reset_globals() + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_sends_event_with_correct_error_type(self, mock_telemetry): + precheckutils.send_post_diagnostic_precheck_failure_telemetry("LinuxNodeExists", "No Linux nodes found") + + mock_telemetry.add_extension_event.assert_called_once() + props = mock_telemetry.add_extension_event.call_args[0][1] + assert props["Context.Default.AzureCLI.onboardingErrorType"] == consts.Post_Diagnostic_Precheck_Fault_Type + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_message_includes_check_name_and_reason(self, mock_telemetry): + precheckutils.send_post_diagnostic_precheck_failure_telemetry("ClusterRoleBindings", "Insufficient permissions") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert msg["checkName"] == "ClusterRoleBindings" + assert msg["reason"] == "Insufficient permissions" + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_message_is_valid_json(self, mock_telemetry): + precheckutils.send_post_diagnostic_precheck_failure_telemetry("SomeCheck", "Some reason") + + props = mock_telemetry.add_extension_event.call_args[0][1] + # Should not raise + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert isinstance(msg, dict) From 045dab9669de48ad8f5d337711171d67e0587510 Mon Sep 17 00:00:00 2001 From: Atchut Kumar Barli Date: Wed, 15 Apr 2026 23:31:51 -0700 Subject: [PATCH 10/15] test: use setdefault for stubs to allow real modules in azdev CI Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../tests/unittests/test_precheckutils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py b/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py index c2e3ce6da80..50447f3b9da 100644 --- a/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py @@ -12,7 +12,9 @@ import pytest -# Stub out heavy dependencies before importing the module under test +# Stub out heavy dependencies before importing the module under test. +# Use setdefault so real modules are preferred when available (e.g. in azdev CI), +# but stubs are used in lightweight environments without full CLI installed. _STUBS = { "kubernetes": MagicMock(), "kubernetes.config": MagicMock(), @@ -49,7 +51,7 @@ "azext_connectedk8s._utils": MagicMock(), } for mod, stub in _STUBS.items(): - sys.modules[mod] = stub + sys.modules.setdefault(mod, stub) sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../../.."))) From 431ab56fad4bc92b53499a0cb381a1b9bdb3f285 Mon Sep 17 00:00:00 2001 From: Atchut Kumar Barli Date: Fri, 17 Apr 2026 16:04:54 -0700 Subject: [PATCH 11/15] linterrors --- .../azext_connectedk8s/_precheckutils.py | 9 +- .../tests/unittests/test_precheckutils.py | 197 +----------------- 2 files changed, 9 insertions(+), 197 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 1b326ae1f56..6d593360926 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -41,7 +41,7 @@ def send_prediagnostic_job_execution_error_telemetry(reason: str = "") -> None: error_detail_msg = {"jobExecutionStatus": prediagnostic_job_execution_status} if reason: error_detail_msg["reason"] = reason - error_message = json.dumps(error_detail_msg) + error_message = azext_utils.process_helm_error_detail(json.dumps(error_detail_msg)) prediagnostic_error_detail = { "Context.Default.AzureCLI.onboardingErrorType": consts.Install_Prediagnostics_Job_Execution_Error_Fault_Type, @@ -89,7 +89,7 @@ def send_prediagnostic_check_failure_telemetry( if crd_error: check_results["crdError"] = crd_error - error_message = json.dumps(check_results) + error_message = azext_utils.process_helm_error_detail(json.dumps(check_results)) prediagnostic_error_detail = { "Context.Default.AzureCLI.onboardingErrorType": consts.Install_Prediagnostics_Fault_Type, @@ -102,7 +102,7 @@ def send_prediagnostic_check_failure_telemetry( def send_post_diagnostic_precheck_failure_telemetry(check_name: str, reason: str) -> None: """Send telemetry for individual precheck failures that occur after the diagnostic job.""" - error_message = json.dumps({"checkName": check_name, "reason": reason}) + error_message = azext_utils.process_helm_error_detail(json.dumps({"checkName": check_name, "reason": reason})) error_detail = { "Context.Default.AzureCLI.onboardingErrorType": consts.Post_Diagnostic_Precheck_Fault_Type, "Context.Default.AzureCLI.onboardingErrorMessage": error_message, @@ -613,7 +613,7 @@ def executing_cluster_diagnostic_checks_job( "An exception has occured while saving the Cluster " "Diagnostic Checks Job logs in the local machine." ) - except Exception as e: + except Exception: logger.exception( "An exception has occured while saving the Cluster " "Diagnostic Checks Job logs in the local machine." @@ -684,6 +684,7 @@ def helm_install_release_cluster_diagnostic_checks( _, error_helm_install = response_helm_install.communicate() if response_helm_install.returncode != 0: error = error_helm_install.decode("ascii") + error = azext_utils.process_helm_error_detail(error) if "forbidden" in error or "timed out waiting for the condition" in error: telemetry.set_user_fault() diff --git a/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py b/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py index 50447f3b9da..ec6c5c4840e 100644 --- a/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py @@ -1,4 +1,4 @@ -# -------------------------------------------------------------------------------------------- +# -------------------------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. See License.txt in the project root for license information. # -------------------------------------------------------------------------------------------- @@ -10,8 +10,6 @@ import sys from unittest.mock import MagicMock, patch -import pytest - # Stub out heavy dependencies before importing the module under test. # Use setdefault so real modules are preferred when available (e.g. in azdev CI), # but stubs are used in lightweight environments without full CLI installed. @@ -53,12 +51,14 @@ for mod, stub in _STUBS.items(): sys.modules.setdefault(mod, stub) +# Make process_helm_error_detail a transparent passthrough so telemetry message assertions work +sys.modules["azext_connectedk8s._utils"].process_helm_error_detail = lambda x: x + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../../.."))) import azext_connectedk8s._constants as consts # noqa: E402 import azext_connectedk8s._precheckutils as precheckutils # noqa: E402 - # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- @@ -287,192 +287,3 @@ def test_different_check_names_produce_separate_events(self, mock_telemetry): msg2 = json.loads(calls[1][0][1]["Context.Default.AzureCLI.onboardingErrorMessage"]) assert msg1["checkName"] == "LinuxNodeExists" assert msg2["checkName"] == "ClusterRoleBindings" - - - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - -def _reset_globals(): - """Reset module-level globals to a clean state before each test.""" - precheckutils.diagnoser_output = [] - precheckutils.prediagnostic_job_execution_status = "NotStarted" - precheckutils.prediagnostic_entra_check = "Starting" - precheckutils.prediagnostic_crd_check = "Starting" - - -# --------------------------------------------------------------------------- -# send_prediagnostic_job_execution_error_telemetry -# --------------------------------------------------------------------------- - -class TestSendJobExecutionErrorTelemetry: - def setup_method(self): - _reset_globals() - - @patch("azext_connectedk8s._precheckutils.telemetry") - def test_sends_event_with_correct_error_type(self, mock_telemetry): - precheckutils.prediagnostic_job_execution_status = "ExecutionFailed" - precheckutils.send_prediagnostic_job_execution_error_telemetry() - - mock_telemetry.add_extension_event.assert_called_once() - args = mock_telemetry.add_extension_event.call_args - assert args[0][0] == "connectedk8s" - props = args[0][1] - assert props["Context.Default.AzureCLI.onboardingErrorType"] == consts.Install_Prediagnostics_Job_Execution_Error_Fault_Type - - @patch("azext_connectedk8s._precheckutils.telemetry") - def test_message_includes_job_execution_status(self, mock_telemetry): - precheckutils.prediagnostic_job_execution_status = "ExecutionFailed" - precheckutils.send_prediagnostic_job_execution_error_telemetry() - - props = mock_telemetry.add_extension_event.call_args[0][1] - msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) - assert msg["jobExecutionStatus"] == "ExecutionFailed" - - @patch("azext_connectedk8s._precheckutils.telemetry") - def test_message_includes_reason_when_provided(self, mock_telemetry): - precheckutils.prediagnostic_job_execution_status = "NotCompleted" - precheckutils.send_prediagnostic_job_execution_error_telemetry(reason="ImagePullBackOff") - - props = mock_telemetry.add_extension_event.call_args[0][1] - msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) - assert msg["reason"] == "ImagePullBackOff" - - @patch("azext_connectedk8s._precheckutils.telemetry") - def test_message_omits_reason_when_empty(self, mock_telemetry): - precheckutils.send_prediagnostic_job_execution_error_telemetry() - - props = mock_telemetry.add_extension_event.call_args[0][1] - msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) - assert "reason" not in msg - - -# --------------------------------------------------------------------------- -# send_prediagnostic_check_failure_telemetry -# --------------------------------------------------------------------------- - -class TestSendCheckFailureTelemetry: - def setup_method(self): - _reset_globals() - - @patch("azext_connectedk8s._precheckutils.telemetry") - def test_sends_event_with_correct_error_type(self, mock_telemetry): - precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Passed") - - mock_telemetry.add_extension_event.assert_called_once() - props = mock_telemetry.add_extension_event.call_args[0][1] - assert props["Context.Default.AzureCLI.onboardingErrorType"] == consts.Install_Prediagnostics_Fault_Type - - @patch("azext_connectedk8s._precheckutils.telemetry") - def test_check_results_in_message(self, mock_telemetry): - precheckutils.prediagnostic_entra_check = "Failed" - precheckutils.prediagnostic_crd_check = "Passed" - precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Failed") - - props = mock_telemetry.add_extension_event.call_args[0][1] - msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) - assert msg["dnsCheck"] == "Passed" - assert msg["outboundConnectivityCheck"] == "Failed" - assert msg["entraCheck"] == "Failed" - assert msg["crdCheck"] == "Passed" - - @patch("azext_connectedk8s._precheckutils.telemetry") - def test_entra_error_extracted_from_diagnoser_output(self, mock_telemetry): - precheckutils.prediagnostic_entra_check = "Failed" - precheckutils.diagnoser_output = [ - "Some log line", - "Error: Entra endpoint not reachable. Response code: 000", - ] - precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Passed") - - props = mock_telemetry.add_extension_event.call_args[0][1] - msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) - assert "entraError" in msg - assert "000" in msg["entraError"] - - @patch("azext_connectedk8s._precheckutils.telemetry") - def test_dns_error_extracted_from_diagnoser_output(self, mock_telemetry): - precheckutils.diagnoser_output = [ - "DNS error: resolution failed for test.example.com", - ] - precheckutils.send_prediagnostic_check_failure_telemetry("Failed", "Passed") - - props = mock_telemetry.add_extension_event.call_args[0][1] - msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) - assert "dnsError" in msg - - @patch("azext_connectedk8s._precheckutils.telemetry") - def test_multiline_error_trimmed_to_first_line(self, mock_telemetry): - precheckutils.prediagnostic_entra_check = "Failed" - precheckutils.diagnoser_output = [ - "Error: Entra endpoint error line1\nline2\nline3", - ] - precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Passed") - - props = mock_telemetry.add_extension_event.call_args[0][1] - msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) - assert "\n" not in msg.get("entraError", "") - assert "line1" in msg.get("entraError", "") - - @patch("azext_connectedk8s._precheckutils.telemetry") - def test_no_error_detail_when_checks_pass(self, mock_telemetry): - precheckutils.prediagnostic_entra_check = "Passed" - precheckutils.prediagnostic_crd_check = "Passed" - precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Passed") - - props = mock_telemetry.add_extension_event.call_args[0][1] - msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) - assert "dnsError" not in msg - assert "entraError" not in msg - assert "outboundError" not in msg - assert "crdError" not in msg - - @patch("azext_connectedk8s._precheckutils.telemetry") - def test_non_error_lines_not_captured(self, mock_telemetry): - """Lines mentioning entra but not 'error' should not be captured.""" - precheckutils.prediagnostic_entra_check = "Failed" - precheckutils.diagnoser_output = [ - "Entra check: starting", - "Entra Authentication Endpoint Connectivity Check Result : https://login.microsoftonline.com : 000", - ] - precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Passed") - - props = mock_telemetry.add_extension_event.call_args[0][1] - msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) - assert "entraError" not in msg - - -# --------------------------------------------------------------------------- -# send_post_diagnostic_precheck_failure_telemetry -# --------------------------------------------------------------------------- - -class TestSendPostDiagnosticPrecheckFailureTelemetry: - def setup_method(self): - _reset_globals() - - @patch("azext_connectedk8s._precheckutils.telemetry") - def test_sends_event_with_correct_error_type(self, mock_telemetry): - precheckutils.send_post_diagnostic_precheck_failure_telemetry("LinuxNodeExists", "No Linux nodes found") - - mock_telemetry.add_extension_event.assert_called_once() - props = mock_telemetry.add_extension_event.call_args[0][1] - assert props["Context.Default.AzureCLI.onboardingErrorType"] == consts.Post_Diagnostic_Precheck_Fault_Type - - @patch("azext_connectedk8s._precheckutils.telemetry") - def test_message_includes_check_name_and_reason(self, mock_telemetry): - precheckutils.send_post_diagnostic_precheck_failure_telemetry("ClusterRoleBindings", "Insufficient permissions") - - props = mock_telemetry.add_extension_event.call_args[0][1] - msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) - assert msg["checkName"] == "ClusterRoleBindings" - assert msg["reason"] == "Insufficient permissions" - - @patch("azext_connectedk8s._precheckutils.telemetry") - def test_message_is_valid_json(self, mock_telemetry): - precheckutils.send_post_diagnostic_precheck_failure_telemetry("SomeCheck", "Some reason") - - props = mock_telemetry.add_extension_event.call_args[0][1] - # Should not raise - msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) - assert isinstance(msg, dict) From 259e61cd3703726ad81694de444e5ff5bbfe575b Mon Sep 17 00:00:00 2001 From: Atchut Kumar Barli Date: Fri, 17 Apr 2026 16:19:42 -0700 Subject: [PATCH 12/15] linterrors --- .../azext_connectedk8s/_constants.py | 8 ++- .../azext_connectedk8s/_precheckutils.py | 46 +++++++++++---- src/connectedk8s/azext_connectedk8s/custom.py | 5 +- .../tests/unittests/test_precheckutils.py | 58 ++++++++++++++----- 4 files changed, 90 insertions(+), 27 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index 3b61146b335..a3baa91d462 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -125,7 +125,9 @@ PrivateKey_Export_Fault_Type = "privatekey-export-error" Install_HelmRelease_Fault_Type = "helm-release-install-error" Install_Prediagnostics_Fault_Type = "prediagnostics-failure" -Install_Prediagnostics_Job_Execution_Error_Fault_Type = "prediagnostics-job-execution-error" +Install_Prediagnostics_Job_Execution_Error_Fault_Type = ( + "prediagnostics-job-execution-error" +) Post_Diagnostic_Precheck_Fault_Type = "post-diagnostic-precheck-failure" Delete_HelmRelease_Fault_Type = "helm-release-delete-error" Check_PodStatus_Fault_Type = "check-pod-status-error" @@ -478,7 +480,9 @@ "Outbound network connectivity check failed for Cluster Connect" ) DNS_Check_Result_String = "DNS Result:" -Entra_Connectivity_Check_Result_String = "Entra Authentication Endpoint Connectivity Check Result" +Entra_Connectivity_Check_Result_String = ( + "Entra Authentication Endpoint Connectivity Check Result" +) CRD_Ownership_Check_Failed_String = "Check Failed: CRD" AZ_CLI_ADAL_TO_MSAL_MIGRATE_VERSION = "2.30.0" CLIENT_PROXY_VERSION = "1.3.033581" diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 6d593360926..76b5b91b74b 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -47,7 +47,6 @@ def send_prediagnostic_job_execution_error_telemetry(reason: str = "") -> None: "Context.Default.AzureCLI.onboardingErrorType": consts.Install_Prediagnostics_Job_Execution_Error_Fault_Type, "Context.Default.AzureCLI.onboardingErrorMessage": error_message, } - print(f"[Telemetry] onboardingErrorType={consts.Install_Prediagnostics_Job_Execution_Error_Fault_Type} onboardingErrorMessage={error_message}") telemetry.add_extension_event("connectedk8s", prediagnostic_error_detail) @@ -65,11 +64,23 @@ def send_prediagnostic_check_failure_telemetry( # Capture first line of each error message to keep telemetry concise if dns_check == "Failed" and "dns" in msg_lower and "error" in msg_lower: dns_error = msg.strip().splitlines()[0] - if outbound_connectivity_check == "Failed" and "outbound" in msg_lower and "error" in msg_lower: + if ( + outbound_connectivity_check == "Failed" + and "outbound" in msg_lower + and "error" in msg_lower + ): outbound_error = msg.strip().splitlines()[0] - if prediagnostic_entra_check == "Failed" and "entra" in msg_lower and "error" in msg_lower: + if ( + prediagnostic_entra_check == "Failed" + and "entra" in msg_lower + and "error" in msg_lower + ): entra_error = msg.strip().splitlines()[0] - if prediagnostic_crd_check == "Failed" and "crd" in msg_lower and "error" in msg_lower: + if ( + prediagnostic_crd_check == "Failed" + and "crd" in msg_lower + and "error" in msg_lower + ): crd_error = msg.strip().splitlines()[0] check_results = { @@ -78,7 +89,7 @@ def send_prediagnostic_check_failure_telemetry( "entraCheck": prediagnostic_entra_check, "crdCheck": prediagnostic_crd_check, } - + # Only add error details if checks actually failed if dns_error: check_results["dnsError"] = dns_error @@ -88,7 +99,7 @@ def send_prediagnostic_check_failure_telemetry( check_results["entraError"] = entra_error if crd_error: check_results["crdError"] = crd_error - + error_message = azext_utils.process_helm_error_detail(json.dumps(check_results)) prediagnostic_error_detail = { @@ -96,18 +107,26 @@ def send_prediagnostic_check_failure_telemetry( "Context.Default.AzureCLI.onboardingErrorMessage": error_message, } - print(f"[Telemetry] onboardingErrorType={consts.Install_Prediagnostics_Fault_Type} onboardingErrorMessage={error_message}") + print( + f"[Telemetry] onboardingErrorType={consts.Install_Prediagnostics_Fault_Type} onboardingErrorMessage={error_message}" + ) telemetry.add_extension_event("connectedk8s", prediagnostic_error_detail) -def send_post_diagnostic_precheck_failure_telemetry(check_name: str, reason: str) -> None: +def send_post_diagnostic_precheck_failure_telemetry( + check_name: str, reason: str +) -> None: """Send telemetry for individual precheck failures that occur after the diagnostic job.""" - error_message = azext_utils.process_helm_error_detail(json.dumps({"checkName": check_name, "reason": reason})) + error_message = azext_utils.process_helm_error_detail( + json.dumps({"checkName": check_name, "reason": reason}) + ) error_detail = { "Context.Default.AzureCLI.onboardingErrorType": consts.Post_Diagnostic_Precheck_Fault_Type, "Context.Default.AzureCLI.onboardingErrorMessage": error_message, } - print(f"[Telemetry] onboardingErrorType={consts.Post_Diagnostic_Precheck_Fault_Type} onboardingErrorMessage={error_message}") + print( + f"[Telemetry] onboardingErrorType={consts.Post_Diagnostic_Precheck_Fault_Type} onboardingErrorMessage={error_message}" + ) telemetry.add_extension_event("connectedk8s", error_detail) @@ -135,7 +154,12 @@ def fetch_diagnostic_checks_results( filepath_with_timestamp: str, storage_space_available: bool, ) -> tuple[str, bool]: - global prediagnostic_job_execution_status, prediagnostic_dns_check, prediagnostic_outbound_check, prediagnostic_entra_check, prediagnostic_crd_check + global \ + prediagnostic_job_execution_status, \ + prediagnostic_dns_check, \ + prediagnostic_outbound_check, \ + prediagnostic_entra_check, \ + prediagnostic_crd_check try: diagnoser_output.clear() prediagnostic_job_execution_status = "NotStarted" diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 5174928664b..19c5797329e 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -420,7 +420,10 @@ def create_connectedk8s( precheckutils.prediagnostic_dns_check, precheckutils.prediagnostic_outbound_check, ) - elif precheckutils.prediagnostic_job_execution_status not in ("Completed", "NotCompleted"): + elif precheckutils.prediagnostic_job_execution_status not in ( + "Completed", + "NotCompleted", + ): precheckutils.send_prediagnostic_job_execution_error_telemetry() if storage_space_available: logger.warning( diff --git a/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py b/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py index ec6c5c4840e..fa4e69e1d74 100644 --- a/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py @@ -1,8 +1,9 @@ -# -------------------------------------------------------------------------------------------- +# -------------------------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. See License.txt in the project root for license information. # -------------------------------------------------------------------------------------------- """Unit tests for prediagnostic telemetry functions in _precheckutils.py.""" + from __future__ import annotations import json @@ -63,6 +64,7 @@ # Helpers # --------------------------------------------------------------------------- + def _reset_globals(): """Reset module-level globals to a clean state before each test.""" precheckutils.diagnoser_output = [] @@ -75,6 +77,7 @@ def _reset_globals(): # send_prediagnostic_job_execution_error_telemetry # --------------------------------------------------------------------------- + class TestSendJobExecutionErrorTelemetry: def setup_method(self): _reset_globals() @@ -88,7 +91,10 @@ def test_sends_event_with_correct_error_type(self, mock_telemetry): args = mock_telemetry.add_extension_event.call_args assert args[0][0] == "connectedk8s" props = args[0][1] - assert props["Context.Default.AzureCLI.onboardingErrorType"] == consts.Install_Prediagnostics_Job_Execution_Error_Fault_Type + assert ( + props["Context.Default.AzureCLI.onboardingErrorType"] + == consts.Install_Prediagnostics_Job_Execution_Error_Fault_Type + ) @patch("azext_connectedk8s._precheckutils.telemetry") def test_message_includes_job_execution_status(self, mock_telemetry): @@ -102,7 +108,9 @@ def test_message_includes_job_execution_status(self, mock_telemetry): @patch("azext_connectedk8s._precheckutils.telemetry") def test_message_includes_reason_when_provided(self, mock_telemetry): precheckutils.prediagnostic_job_execution_status = "NotCompleted" - precheckutils.send_prediagnostic_job_execution_error_telemetry(reason="ImagePullBackOff") + precheckutils.send_prediagnostic_job_execution_error_telemetry( + reason="ImagePullBackOff" + ) props = mock_telemetry.add_extension_event.call_args[0][1] msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) @@ -118,7 +126,9 @@ def test_message_omits_reason_when_empty(self, mock_telemetry): @patch("azext_connectedk8s._precheckutils.telemetry") def test_message_is_valid_json(self, mock_telemetry): - precheckutils.send_prediagnostic_job_execution_error_telemetry(reason="ContainerCreating") + precheckutils.send_prediagnostic_job_execution_error_telemetry( + reason="ContainerCreating" + ) props = mock_telemetry.add_extension_event.call_args[0][1] msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) @@ -129,6 +139,7 @@ def test_message_is_valid_json(self, mock_telemetry): # send_prediagnostic_check_failure_telemetry # --------------------------------------------------------------------------- + class TestSendCheckFailureTelemetry: def setup_method(self): _reset_globals() @@ -139,7 +150,10 @@ def test_sends_event_with_correct_error_type(self, mock_telemetry): mock_telemetry.add_extension_event.assert_called_once() props = mock_telemetry.add_extension_event.call_args[0][1] - assert props["Context.Default.AzureCLI.onboardingErrorType"] == consts.Install_Prediagnostics_Fault_Type + assert ( + props["Context.Default.AzureCLI.onboardingErrorType"] + == consts.Install_Prediagnostics_Fault_Type + ) @patch("azext_connectedk8s._precheckutils.telemetry") def test_check_results_in_message(self, mock_telemetry): @@ -247,21 +261,29 @@ def test_crd_error_extracted_from_diagnoser_output(self, mock_telemetry): # send_post_diagnostic_precheck_failure_telemetry # --------------------------------------------------------------------------- + class TestSendPostDiagnosticPrecheckFailureTelemetry: def setup_method(self): _reset_globals() @patch("azext_connectedk8s._precheckutils.telemetry") def test_sends_event_with_correct_error_type(self, mock_telemetry): - precheckutils.send_post_diagnostic_precheck_failure_telemetry("LinuxNodeExists", "No Linux nodes found") + precheckutils.send_post_diagnostic_precheck_failure_telemetry( + "LinuxNodeExists", "No Linux nodes found" + ) mock_telemetry.add_extension_event.assert_called_once() props = mock_telemetry.add_extension_event.call_args[0][1] - assert props["Context.Default.AzureCLI.onboardingErrorType"] == consts.Post_Diagnostic_Precheck_Fault_Type + assert ( + props["Context.Default.AzureCLI.onboardingErrorType"] + == consts.Post_Diagnostic_Precheck_Fault_Type + ) @patch("azext_connectedk8s._precheckutils.telemetry") def test_message_includes_check_name_and_reason(self, mock_telemetry): - precheckutils.send_post_diagnostic_precheck_failure_telemetry("ClusterRoleBindings", "Insufficient permissions") + precheckutils.send_post_diagnostic_precheck_failure_telemetry( + "ClusterRoleBindings", "Insufficient permissions" + ) props = mock_telemetry.add_extension_event.call_args[0][1] msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) @@ -270,7 +292,9 @@ def test_message_includes_check_name_and_reason(self, mock_telemetry): @patch("azext_connectedk8s._precheckutils.telemetry") def test_message_is_valid_json(self, mock_telemetry): - precheckutils.send_post_diagnostic_precheck_failure_telemetry("SomeCheck", "Some reason") + precheckutils.send_post_diagnostic_precheck_failure_telemetry( + "SomeCheck", "Some reason" + ) props = mock_telemetry.add_extension_event.call_args[0][1] msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) @@ -278,12 +302,20 @@ def test_message_is_valid_json(self, mock_telemetry): @patch("azext_connectedk8s._precheckutils.telemetry") def test_different_check_names_produce_separate_events(self, mock_telemetry): - precheckutils.send_post_diagnostic_precheck_failure_telemetry("LinuxNodeExists", "No nodes") - precheckutils.send_post_diagnostic_precheck_failure_telemetry("ClusterRoleBindings", "No perms") + precheckutils.send_post_diagnostic_precheck_failure_telemetry( + "LinuxNodeExists", "No nodes" + ) + precheckutils.send_post_diagnostic_precheck_failure_telemetry( + "ClusterRoleBindings", "No perms" + ) assert mock_telemetry.add_extension_event.call_count == 2 calls = mock_telemetry.add_extension_event.call_args_list - msg1 = json.loads(calls[0][0][1]["Context.Default.AzureCLI.onboardingErrorMessage"]) - msg2 = json.loads(calls[1][0][1]["Context.Default.AzureCLI.onboardingErrorMessage"]) + msg1 = json.loads( + calls[0][0][1]["Context.Default.AzureCLI.onboardingErrorMessage"] + ) + msg2 = json.loads( + calls[1][0][1]["Context.Default.AzureCLI.onboardingErrorMessage"] + ) assert msg1["checkName"] == "LinuxNodeExists" assert msg2["checkName"] == "ClusterRoleBindings" From a00f41266b6db4c3ccf40d51d49a047233c5da25 Mon Sep 17 00:00:00 2001 From: Atchut Kumar Barli Date: Fri, 17 Apr 2026 18:39:31 -0700 Subject: [PATCH 13/15] testfixes --- .../tests/unittests/test_precheckutils.py | 7 +++++ .../tests/unittests/test_utils_.py | 30 ++++++++++++++++++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py b/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py index fa4e69e1d74..bf6e8be1f6c 100644 --- a/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py @@ -49,6 +49,7 @@ # Stub the sibling module to avoid its transitive imports "azext_connectedk8s._utils": MagicMock(), } +_ORIGINAL_MODULES = {mod: sys.modules.get(mod) for mod in _STUBS} for mod, stub in _STUBS.items(): sys.modules.setdefault(mod, stub) @@ -60,6 +61,12 @@ import azext_connectedk8s._constants as consts # noqa: E402 import azext_connectedk8s._precheckutils as precheckutils # noqa: E402 +for mod, original_module in _ORIGINAL_MODULES.items(): + if original_module is None: + sys.modules.pop(mod, None) + else: + sys.modules[mod] = original_module + # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- diff --git a/src/connectedk8s/azext_connectedk8s/tests/unittests/test_utils_.py b/src/connectedk8s/azext_connectedk8s/tests/unittests/test_utils_.py index 32d1da1e3b4..7f3eed89fde 100644 --- a/src/connectedk8s/azext_connectedk8s/tests/unittests/test_utils_.py +++ b/src/connectedk8s/azext_connectedk8s/tests/unittests/test_utils_.py @@ -4,11 +4,39 @@ # -------------------------------------------------------------------------------------------- import os import sys +from unittest.mock import MagicMock import pytest sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../../.."))) -from azext_connectedk8s._utils import ( + +if isinstance(sys.modules.get("azext_connectedk8s._utils"), MagicMock): + sys.modules.pop("azext_connectedk8s._utils", None) + +_STUBS = { + "azure": MagicMock(), + "azure.cli": MagicMock(), + "azure.cli.core": MagicMock(), + "azure.cli.core.azclierror": MagicMock(), + "azure.cli.core.commands": MagicMock(), + "azure.cli.core.commands.client_factory": MagicMock(), + "azure.cli.core.util": MagicMock(), + "azure.core": MagicMock(), + "azure.core.exceptions": MagicMock(), + "knack": MagicMock(), + "knack.log": MagicMock(), + "knack.prompting": MagicMock(), + "kubernetes": MagicMock(), + "kubernetes.client": MagicMock(), + "kubernetes.client.rest": MagicMock(), + "msrest": MagicMock(), + "msrest.exceptions": MagicMock(), + "azext_connectedk8s._client_factory": MagicMock(), +} +for mod, stub in _STUBS.items(): + sys.modules.setdefault(mod, stub) + +from azext_connectedk8s._utils import ( # noqa: E402 get_mcr_path, process_helm_error_detail, redact_sensitive_fields_from_string, From 091fbbce0c8767fad1b7d2fc3c2da46e457b1a2c Mon Sep 17 00:00:00 2001 From: Atchut Kumar Barli Date: Mon, 20 Apr 2026 12:52:17 -0700 Subject: [PATCH 14/15] fix unittest --- .../tests/unittests/test_precheckutils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py b/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py index bf6e8be1f6c..194559196f4 100644 --- a/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py @@ -53,8 +53,12 @@ for mod, stub in _STUBS.items(): sys.modules.setdefault(mod, stub) -# Make process_helm_error_detail a transparent passthrough so telemetry message assertions work -sys.modules["azext_connectedk8s._utils"].process_helm_error_detail = lambda x: x +# Make process_helm_error_detail a transparent passthrough so telemetry message assertions work. +# Only patch if this is our MagicMock stub — if the real module is already loaded (e.g. in full +# azdev CI), patching it here would permanently mutate its attribute on the shared module object. +_utils_stub = sys.modules.get("azext_connectedk8s._utils") +if isinstance(_utils_stub, MagicMock): + _utils_stub.process_helm_error_detail = lambda x: x sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../../.."))) From 59cf5033c0255732365dc20c0894aec60153c018 Mon Sep 17 00:00:00 2001 From: Atchut Kumar Barli Date: Mon, 20 Apr 2026 14:37:05 -0700 Subject: [PATCH 15/15] removingunexpetedfiles --- src/connectedk8s/policy.yam; | 16 ---------------- src/connectedk8s/test.json | 14 -------------- 2 files changed, 30 deletions(-) delete mode 100644 src/connectedk8s/policy.yam; delete mode 100644 src/connectedk8s/test.json diff --git a/src/connectedk8s/policy.yam; b/src/connectedk8s/policy.yam; deleted file mode 100644 index ed5b3ccda76..00000000000 --- a/src/connectedk8s/policy.yam; +++ /dev/null @@ -1,16 +0,0 @@ -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - name: block-dns - namespace: azure-arc-release -spec: - podSelector: {} # applies to all pods in namespace - policyTypes: - - Egress - egress: - - ports: - - port: 443 # allow HTTPS so only DNS is broken, not everything - protocol: TCP - - port: 80 - protocol: TCP - # port 53 (UDP+TCP) is NOT listed → blocked \ No newline at end of file diff --git a/src/connectedk8s/test.json b/src/connectedk8s/test.json deleted file mode 100644 index 89fd221c196..00000000000 --- a/src/connectedk8s/test.json +++ /dev/null @@ -1,14 +0,0 @@ -{'Context.Default.AzureCLI.onboardingErrorType': 'prediagnostics-job-execution-error', -'Context.Default.AzureCLI.onboardingErrorMessage': - 'jobExecutionStatus=ExecutionFailed; reason=Failed to execute Cluster Diagnostic Checks Job: (400)\nReason: Bad Request\nHTTP response headers: HTTPHeaderDict({\'Audit-Id\': \'5a3b0ffc-444f-4b05-b406-b48889d1c38c\', \'Cache-Control\': \'no-cache, private\', \'Content-Type\': \'application/json\', \'Date\': \'Mon, - 23 Mar 2026 02: 16: 00 GMT\', \'Content-Length\': \'250\' - })\nHTTP response body: { - "kind": "Status", - "apiVersion": "v1", - "metadata": {}, - "status": "Failure", - "message": "container \\"cluster-diagnostic-checks-container\\" in pod \\"cluster-diagnostic-checks-job-vtg9d\\" is waiting to start: ContainerCreating", - "reason": "BadRequest", - "code": 400 - }\n\n', 'Context.Default.AzureCLI.ExtensionName': 'connectedk8s', 'Reserved.DataModel.CorrelationId': 'ed794940-25ba-43bf-86bf-7a9ce8d0fd58' -} \ No newline at end of file