From 144c7c2cbd6bb8929d4c7fd370fca1c92e71edfa Mon Sep 17 00:00:00 2001 From: Qi Ni Date: Thu, 4 Sep 2025 15:21:28 +1000 Subject: [PATCH 01/11] feat: Improve user experience of az aks agent with aks-mcp Enhance the user experience of az aks agent, including: 1. Use aks-mcp by default, offering an opt-out flag --no-aks-mcp. 2. Disable duplicated built-in toolsets when using aks-mcp. 3. Manage the lifecycle of aks-mcp binary, including downloading, updating, health checking and gracefully stopping. 4. Offer status subcommand to display the system status. Refine system prompt. 5. Smart toolset refreshment when switching between mcp and traditional mode. --- src/aks-agent/HISTORY.rst | 11 + src/aks-agent/azext_aks_agent/_consts.py | 8 + src/aks-agent/azext_aks_agent/_help.py | 15 + src/aks-agent/azext_aks_agent/_params.py | 6 + src/aks-agent/azext_aks_agent/agent/agent.py | 686 +++++++++++++++--- .../azext_aks_agent/agent/binary_manager.py | 517 +++++++++++++ .../azext_aks_agent/agent/config_generator.py | 174 +++++ .../azext_aks_agent/agent/error_handler.py | 255 +++++++ .../azext_aks_agent/agent/mcp_manager.py | 335 +++++++++ src/aks-agent/azext_aks_agent/agent/prompt.py | 141 +++- src/aks-agent/azext_aks_agent/agent/status.py | 352 +++++++++ .../azext_aks_agent/agent/status_models.py | 254 +++++++ .../azext_aks_agent/agent/user_feedback.py | 108 +++ src/aks-agent/azext_aks_agent/commands.py | 4 + src/aks-agent/azext_aks_agent/custom.py | 230 ++++++ .../latest/test_aks_agent_binary_manager.py | 602 +++++++++++++++ .../latest/test_aks_agent_config_generator.py | 300 ++++++++ .../latest/test_aks_agent_error_handler.py | 224 ++++++ .../latest/test_aks_agent_mcp_integration.py | 291 ++++++++ .../tests/latest/test_aks_agent_parameters.py | 72 ++ .../latest/test_aks_agent_smart_refresh.py | 218 ++++++ .../tests/latest/test_aks_agent_status.py | 390 ++++++++++ .../latest/test_aks_agent_status_command.py | 223 ++++++ .../latest/test_aks_agent_status_models.py | 474 ++++++++++++ .../tests/latest/test_mcp_manager.py | 393 ++++++++++ .../tests/latest/test_user_feedback.py | 218 ++++++ src/aks-agent/setup.py | 2 +- 27 files changed, 6399 insertions(+), 104 deletions(-) create mode 100644 src/aks-agent/azext_aks_agent/agent/binary_manager.py create mode 100644 src/aks-agent/azext_aks_agent/agent/config_generator.py create mode 100644 src/aks-agent/azext_aks_agent/agent/error_handler.py create mode 100644 src/aks-agent/azext_aks_agent/agent/mcp_manager.py create mode 100644 src/aks-agent/azext_aks_agent/agent/status.py create mode 100644 src/aks-agent/azext_aks_agent/agent/status_models.py create mode 100644 src/aks-agent/azext_aks_agent/agent/user_feedback.py create mode 100644 src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_binary_manager.py create mode 100644 src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_config_generator.py create mode 100644 src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_error_handler.py create mode 100644 src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_mcp_integration.py create mode 100644 src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_parameters.py create mode 100644 src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_smart_refresh.py create mode 100644 src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_status.py create mode 100644 src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_status_command.py create mode 100644 src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_status_models.py create mode 100644 src/aks-agent/azext_aks_agent/tests/latest/test_mcp_manager.py create mode 100644 src/aks-agent/azext_aks_agent/tests/latest/test_user_feedback.py diff --git a/src/aks-agent/HISTORY.rst b/src/aks-agent/HISTORY.rst index f193943bdf5..0fbd9384339 100644 --- a/src/aks-agent/HISTORY.rst +++ b/src/aks-agent/HISTORY.rst @@ -12,6 +12,17 @@ To release a new version, please select a new version number (usually plus 1 to Pending +++++++ +1.0.0b2 ++++++++ + +- Add MCP integration for `az aks agent` with aks-mcp binary management and local server lifecycle (download, version validation, start/stop, health checks). +- Introduce dual-mode operation: MCP mode (enhanced) and Traditional mode (built-in toolsets), with mode-specific system prompts. +- Implement smart toolset refresh strategy with persisted mode state to avoid unnecessary refresh on repeated runs. +- Add `--no-aks-mcp` flag to force Traditional mode when desired. +- Add `az aks agent status` command to display MCP binary availability/version, server health, and overall mode/readiness. +- Add structured error handling with user-friendly messages and actionable suggestions for MCP/binary/server/config errors. +- Port and adapt comprehensive unit tests covering binary manager, MCP manager, configuration generation/validation, status models/collection, error handling, user feedback, parameters, smart refresh, MCP integration, and status command. + 1.0.0b1 +++++++ * Add interactive AI-powered debugging tool `az aks agent`. diff --git a/src/aks-agent/azext_aks_agent/_consts.py b/src/aks-agent/azext_aks_agent/_consts.py index 5355b049263..38d12270fce 100644 --- a/src/aks-agent/azext_aks_agent/_consts.py +++ b/src/aks-agent/azext_aks_agent/_consts.py @@ -8,3 +8,11 @@ CONST_AGENT_NAME = "AKS AGENT" CONST_AGENT_NAME_ENV_KEY = "AGENT_NAME" CONST_AGENT_CONFIG_FILE_NAME = "aksAgent.yaml" + +# MCP Integration Constants (ported from previous change) +CONST_MCP_BINARY_NAME = "aks-mcp" +CONST_MCP_DEFAULT_PORT = 8003 +CONST_MCP_DEFAULT_URL = "http://localhost:8003/sse" +CONST_MCP_MIN_VERSION = "0.0.7" +CONST_MCP_GITHUB_REPO = "Azure/aks-mcp" +CONST_MCP_BINARY_DIR = "bin" diff --git a/src/aks-agent/azext_aks_agent/_help.py b/src/aks-agent/azext_aks_agent/_help.py index d97a1ccedc9..d3b4dc86dd0 100644 --- a/src/aks-agent/azext_aks_agent/_help.py +++ b/src/aks-agent/azext_aks_agent/_help.py @@ -48,6 +48,9 @@ - name: --refresh-toolsets type: bool short-summary: Refresh the toolsets status. + - name: --no-aks-mcp + type: bool + short-summary: Disable AKS MCP integration and use traditional toolsets. examples: - name: Ask about pod issues in the cluster with Azure OpenAI @@ -104,3 +107,15 @@ enabled: false ``` """ + +helps[ + "aks agent status" +] = """ + type: command + short-summary: Show AKS agent configuration and readiness (MCP binary, server, and mode). + long-summary: |- + Displays MCP binary availability and version, MCP server running/healthy state (if applicable), and overall mode. + examples: + - name: Show agent status + text: az aks agent status +""" diff --git a/src/aks-agent/azext_aks_agent/_params.py b/src/aks-agent/azext_aks_agent/_params.py index 89c2abc7f6f..31d91596852 100644 --- a/src/aks-agent/azext_aks_agent/_params.py +++ b/src/aks-agent/azext_aks_agent/_params.py @@ -77,3 +77,9 @@ def load_arguments(self, _): help="Refresh the toolsets status.", action="store_true", ) + c.argument( + "no_aks_mcp", + options_list=["--no-aks-mcp"], + help="Disable AKS MCP integration and use traditional toolsets.", + action="store_true", + ) diff --git a/src/aks-agent/azext_aks_agent/agent/agent.py b/src/aks-agent/azext_aks_agent/agent/agent.py index d456a93458a..784de7e8d62 100644 --- a/src/aks-agent/azext_aks_agent/agent/agent.py +++ b/src/aks-agent/azext_aks_agent/agent/agent.py @@ -5,10 +5,7 @@ import logging import os -import socket import sys -import uuid -from pathlib import Path from azext_aks_agent._consts import ( CONST_AGENT_CONFIG_PATH_DIR_ENV_KEY, @@ -19,8 +16,9 @@ from azure.cli.core.commands.client_factory import get_subscription_id from knack.util import CLIError -from .prompt import AKS_CONTEXT_PROMPT +from .prompt import AKS_CONTEXT_PROMPT_MCP, AKS_CONTEXT_PROMPT_TRADITIONAL from .telemetry import CLITelemetryClient +from .error_handler import MCPError # NOTE(mainred): holmes leverage the log handler RichHandler to provide colorful, readable and well-formatted logs @@ -43,6 +41,77 @@ def init_log(): return init_logging([]) +def _get_mode_state_file() -> str: + """Get the path to the mode state file.""" + config_dir = get_config_dir() + return os.path.join(config_dir, "aks_agent_mode_state") + + +def _get_last_mode() -> str: + """ + Get the last used mode from state file. + + :return: 'mcp' or 'traditional' or 'unknown' if no state exists + :rtype: str + """ + state_file = _get_mode_state_file() + try: + if os.path.exists(state_file): + with open(state_file, 'r') as f: + mode = f.read().strip() + return mode if mode in ['mcp', 'traditional'] else 'unknown' + except (IOError, OSError): + pass + return 'unknown' + + +def _save_current_mode(mode: str) -> None: + """ + Save the current mode to state file. + + :param mode: 'mcp' or 'traditional' + :type mode: str + """ + if mode not in ['mcp', 'traditional']: + return + + state_file = _get_mode_state_file() + try: + # Ensure config directory exists + os.makedirs(os.path.dirname(state_file), exist_ok=True) + with open(state_file, 'w') as f: + f.write(mode) + except (IOError, OSError): + # Silently ignore state saving errors to avoid breaking the agent + pass + + +def _should_refresh_toolsets(requested_mode: str, user_refresh_request: bool) -> bool: + """ + Determine if toolsets should be refreshed based on mode transition and user request. + + :param requested_mode: The mode being requested ('mcp' or 'traditional') + :type requested_mode: str + :param user_refresh_request: Whether user explicitly requested refresh + :type user_refresh_request: bool + :return: True if toolsets should be refreshed + :rtype: bool + """ + # Always honor explicit user request to refresh + if user_refresh_request: + return True + + # Check if we're switching modes + last_mode = _get_last_mode() + + # Refresh on first run (unknown state) or mode transition + if last_mode == 'unknown' or last_mode != requested_mode: + return True + + # Same mode as last time, no refresh needed + return False + + # pylint: disable=too-many-locals def aks_agent( cmd, @@ -57,6 +126,7 @@ def aks_agent( no_echo_request, show_tool_output, refresh_toolsets, + no_aks_mcp=False, ): """ Interact with the AKS agent using a prompt or piped input. @@ -77,34 +147,25 @@ def aks_agent( :type show_tool_output: bool :param refresh_toolsets: Refresh the toolsets status. :type refresh_toolsets: bool + :param no_aks_mcp: Disable AKS MCP integration and use traditional toolsets. + :type no_aks_mcp: bool """ - with CLITelemetryClient(): + with CLITelemetryClient(): if sys.version_info < (3, 10): raise CLIError( "Please upgrade the python version to 3.10 or above to use aks agent." ) - # reverse the value of the variables so that + # Initialize variables interactive = not no_interactive echo = not no_echo_request - console = init_log() + # Set environment variables for Holmes os.environ[CONST_AGENT_CONFIG_PATH_DIR_ENV_KEY] = get_config_dir() - # Holmes library allows the user to specify the agent name through environment variable - # before loading the library. - os.environ[CONST_AGENT_NAME_ENV_KEY] = CONST_AGENT_NAME - from holmes.config import Config - from holmes.core.prompt import build_initial_ask_messages - from holmes.interactive import run_interactive_loop - from holmes.plugins.destinations import DestinationType - from holmes.plugins.interfaces import Issue - from holmes.plugins.prompts import load_and_render_prompt - from holmes.utils.console.result import handle_result - # Detect and read piped input piped_data = None if not sys.stdin.isatty(): @@ -115,96 +176,533 @@ def aks_agent( ) interactive = False - expanded_config_file = Path(os.path.expanduser(config_file)) + # Determine MCP mode and smart refresh logic + use_aks_mcp = not no_aks_mcp + current_mode = "mcp" if use_aks_mcp else "traditional" + smart_refresh = _should_refresh_toolsets(current_mode, refresh_toolsets) - config = Config.load_from_file( - expanded_config_file, - model=model, - api_key=api_key, - max_steps=max_steps, - ) + if show_tool_output: + from .user_feedback import ProgressReporter + last_mode = _get_last_mode() + ProgressReporter.show_status_message( + f"Mode transition check: {last_mode} → {current_mode}, smart_refresh: {smart_refresh}", "info" + ) - ai = config.create_console_toolcalling_llm( - dal=None, - refresh_toolsets=refresh_toolsets, - ) - console.print( - "[bold yellow]This tool uses AI to generate responses and may not always be accurate.[bold yellow]" - ) + # MCP Lifecycle Manager + mcp_lifecycle = MCPLifecycleManager() + + try: + config = None + + if use_aks_mcp: + try: + config_params = { + 'config_file': config_file, + 'model': model, + 'api_key': api_key, + 'max_steps': max_steps, + 'verbose': show_tool_output + } + mcp_info = mcp_lifecycle.setup_mcp_sync(config_params) + config = mcp_info['config'] + + if show_tool_output: + from .user_feedback import ProgressReporter + ProgressReporter.show_status_message("MCP mode active - enhanced capabilities enabled", "info") + + except Exception as e: # pylint: disable=broad-exception-caught + # Fallback to traditional mode on any MCP setup failure + from .error_handler import AgentErrorHandler + mcp_error = AgentErrorHandler.handle_mcp_setup_error(e, "MCP initialization") + if show_tool_output: + console.print(f"[yellow]MCP setup failed, using traditional mode: {mcp_error.message}[/yellow]") + if mcp_error.suggestions: + console.print("[dim]Suggestions for next time:[/dim]") + for suggestion in mcp_error.suggestions[:3]: # Show only first 3 suggestions + console.print(f"[dim] • {suggestion}[/dim]") + use_aks_mcp = False + current_mode = "traditional" + + # Fallback to traditional mode if MCP setup failed or was disabled + if not config: + config = _setup_traditional_mode_sync(config_file, model, api_key, max_steps, show_tool_output) + if show_tool_output: + console.print("[yellow]Traditional mode active (MCP disabled)[/yellow]") + + # Save the current mode to state file for next run + _save_current_mode(current_mode) + + # Use smart refresh logic + effective_refresh_toolsets = smart_refresh + if show_tool_output: + from .user_feedback import ProgressReporter + ProgressReporter.show_status_message( + f"Toolset refresh: {effective_refresh_toolsets} (Mode: {current_mode})", "info" + ) - if not prompt and not interactive and not piped_data: - raise CLIError( - "Either the 'prompt' argument must be provided (unless using --interactive mode)." + # Create AI client once with proper refresh settings + ai = config.create_console_toolcalling_llm( + dal=None, + refresh_toolsets=effective_refresh_toolsets, ) - # Handle piped data - if piped_data: - if prompt: - # User provided both piped data and a prompt - prompt = f"Here's some piped output:\n\n{piped_data}\n\n{prompt}" + # Validate inputs + if not prompt and not interactive and not piped_data: + raise CLIError( + "Either the 'prompt' argument must be provided (unless using --interactive mode)." + ) + + # Handle piped data + if piped_data: + if prompt: + # User provided both piped data and a prompt + prompt = f"Here's some piped output:\n\n{piped_data}\n\n{prompt}" + else: + # Only piped data, no prompt - ask what to do with it + prompt = f"Here's some piped output:\n\n{piped_data}\n\nWhat can you tell me about this output?" + + # Phase 2: Holmes Execution (synchronous - no event loop conflicts) + is_mcp_mode = current_mode == "mcp" + if interactive: + _run_interactive_mode_sync(ai, cmd, resource_group_name, name, + prompt, console, show_tool_output, is_mcp_mode) else: - # Only piped data, no prompt - ask what to do with it - prompt = f"Here's some piped output:\n\n{piped_data}\n\nWhat can you tell me about this output?" + _run_noninteractive_mode_sync(ai, config, cmd, resource_group_name, name, + prompt, console, echo, show_tool_output, is_mcp_mode) - if echo and not interactive and prompt: - console.print("[bold yellow]User:[/bold yellow] " + prompt) + finally: + # Phase 3: MCP Cleanup (isolated async if needed) + mcp_lifecycle.cleanup_mcp_sync() - subscription_id = get_subscription_id(cmd.cli_ctx) - aks_template_context = { - "cluster_name": name, - "resource_group": resource_group_name, - "subscription_id": subscription_id, - } +def _initialize_mcp_manager(verbose: bool = False): + """ + Initialize MCP manager for the current session. - aks_context_prompt = load_and_render_prompt( - AKS_CONTEXT_PROMPT, aks_template_context + :param verbose: Enable verbose output + :return: Initialized MCP manager + :raises: Exception if initialization fails + """ + try: + from .mcp_manager import MCPManager + return MCPManager(verbose=verbose) + except ImportError as e: + raise MCPError( + f"MCP manager initialization failed: {str(e)}", + "MCP_IMPORT", + [ + "Ensure all required dependencies are installed", + "Try reinstalling the aks-preview extension", + "Use --no-aks-mcp flag to disable MCP integration" + ] ) - # Variables not exposed to the user. - # Adds a prompt for post processing. - post_processing_prompt = None - # File to append to prompt - - include_file = None - if interactive: - run_interactive_loop( - ai, - console, - prompt, - include_file, - post_processing_prompt, - show_tool_output=show_tool_output, - system_prompt_additions=aks_context_prompt, - ) - return - - messages = build_initial_ask_messages( - console, - prompt, - include_file, - ai.tool_executor, - config.get_runbook_catalog(), - system_prompt_additions=aks_context_prompt, + +async def _setup_mcp_mode(mcp_manager, config_file: str, model: str, api_key: str, + max_steps: int, verbose: bool = False): + """ + Setup MCP mode configuration and start server. + + :param mcp_manager: Initialized MCP manager + :param config_file: Path to configuration file + :param model: Model name + :param api_key: API key + :param max_steps: Maximum steps + :param verbose: Enable verbose output + :return: Enhanced Holmes configuration + :raises: Exception if MCP setup fails + """ + from pathlib import Path + import yaml + import tempfile + from holmes.config import Config + from .config_generator import ConfigurationGenerator + from .user_feedback import ProgressReporter + from .error_handler import AgentErrorHandler + + # Ensure binary is available (download if needed) + if not mcp_manager.is_binary_available() or not mcp_manager.validate_binary_version(): + if verbose: + ProgressReporter.show_status_message("Downloading MCP binary...", "info") + + # This will raise an exception if download fails + binary_status = await mcp_manager.binary_manager.ensure_binary() + if not binary_status.ready: + error_msg = binary_status.error_message or "Unknown error during binary setup" + raise AgentErrorHandler.handle_binary_error(Exception(error_msg), "setup") + + # Start MCP server + if verbose: + ProgressReporter.show_status_message("Starting MCP server...", "info") + + server_started = await mcp_manager.start_server() + if not server_started: + raise AgentErrorHandler.handle_server_error(Exception("Server startup failed"), "startup") + + # Get MCP server URL + server_url = mcp_manager.get_server_url() + if not server_url: + raise AgentErrorHandler.handle_server_error(Exception("Server URL unavailable after startup"), "configuration") + + # Load base configuration as dictionary + expanded_config_file = Path(os.path.expanduser(config_file)) + base_config_dict = {} + + if expanded_config_file.exists(): + try: + with open(expanded_config_file, 'r') as f: + base_config_dict = yaml.safe_load(f) or {} + except Exception as e: # pylint: disable=broad-exception-caught + if verbose: + ProgressReporter.show_status_message(f"Warning: Could not load config file: {e}", "warning") + # Continue with empty dict if config file cannot be loaded + base_config_dict = {} + + # Generate enhanced MCP config + mcp_config_dict = ConfigurationGenerator.generate_mcp_config(base_config_dict, server_url) + + # Create temporary config file with MCP settings + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as temp_file: + yaml.dump(mcp_config_dict, temp_file) + temp_config_path = temp_file.name + + try: + # Load config from temporary file using Holmes API + enhanced_config = Config.load_from_file( + Path(temp_config_path), + model=model, + api_key=api_key, + max_steps=max_steps, ) + return enhanced_config + finally: + # Clean up temporary file + try: + os.unlink(temp_config_path) + except OSError: + pass # Ignore cleanup errors - response = ai.call(messages) - messages = response.messages +def _run_async_operation(async_func, *args, **kwargs): + """ + Run async operation in isolated event loop to avoid nested loop conflicts. - issue = Issue( - id=str(uuid.uuid4()), - name=prompt, - source_type="holmes-ask", - raw={"prompt": prompt, "full_conversation": messages}, - source_instance_id=socket.gethostname(), - ) - handle_result( - response, - console, - DestinationType.CLI, - config, - issue, - show_tool_output, - False, + This helper safely executes async operations by: + 1. First attempting to use asyncio.run() (normal case) + 2. If that fails due to existing event loop, using concurrent execution + + :param async_func: Async function to execute + :param args: Positional arguments for the function + :param kwargs: Keyword arguments for the function + :return: Result of the async function execution + :raises: Exception if the async operation fails + """ + import asyncio + import concurrent.futures + + try: + # Normal case: no existing event loop + return asyncio.run(async_func(*args, **kwargs)) + except RuntimeError as e: + if "cannot be called from a running event loop" in str(e): + # Nested case: run in separate thread with new event loop + def run_in_thread(): + # Create new event loop in this thread + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + return loop.run_until_complete(async_func(*args, **kwargs)) + finally: + loop.close() + asyncio.set_event_loop(None) + + # Execute in thread pool to avoid blocking + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: + future = executor.submit(run_in_thread) + return future.result(timeout=300) # 5 minute timeout + else: + # Re-raise other RuntimeErrors + raise + + +class MCPLifecycleManager: + """ + Manages MCP (Model Context Protocol) lifecycle with isolated async operations. + + This class decouples MCP lifecycle management (setup/cleanup) from main execution + to prevent asyncio event loop conflicts. It provides synchronous wrappers for + async operations that can be safely called from any context. + """ + + def __init__(self): + """Initialize MCP lifecycle manager.""" + self.mcp_manager = None + self.config = None + self._setup_params = None + + def setup_mcp_sync(self, config_params): + """ + Synchronous wrapper for MCP setup phase. + + This method handles: + - MCP manager initialization + - Binary download and validation + - MCP server startup + - Configuration generation + + :param config_params: Dictionary containing config parameters: + - config_file: Path to configuration file + - model: Model name + - api_key: API key + - max_steps: Maximum steps + - verbose: Enable verbose output + :type config_params: dict + :return: Dictionary with setup results including config and server info + :rtype: dict + :raises: Exception if MCP setup fails + """ + self._setup_params = config_params + return _run_async_operation(self._setup_mcp_async, config_params) + + def cleanup_mcp_sync(self): + """ + Synchronous wrapper for MCP cleanup phase. + + Gracefully shuts down MCP server and cleans up resources. + Safe to call even if setup failed or was never called. + """ + if self.mcp_manager: + try: + _run_async_operation(self._cleanup_mcp_async) + except Exception as e: # pylint: disable=broad-exception-caught + # Log cleanup errors but don't re-raise to avoid masking original errors + try: + from .user_feedback import ProgressReporter + ProgressReporter.show_status_message( + f"Warning: Error during MCP cleanup: {str(e)}", "warning" + ) + except Exception: # pylint: disable=broad-exception-caught + # Fallback to basic logging if ProgressReporter fails + print(f"Warning: Error during MCP cleanup: {str(e)}") + + async def _setup_mcp_async(self, config_params): + """ + Async MCP setup: binary download, server start, config generation. + + :param config_params: Configuration parameters dictionary + :return: Setup results including config and server information + :raises: Exception if any setup step fails + """ + verbose = config_params.get('verbose', False) + + # Initialize MCP manager + self.mcp_manager = _initialize_mcp_manager(verbose) + + # Setup MCP mode using existing function + self.config = await _setup_mcp_mode( + self.mcp_manager, + config_params['config_file'], + config_params['model'], + config_params['api_key'], + config_params['max_steps'], + verbose ) + + # Return setup results + return { + 'config': self.config, + 'server_url': self.mcp_manager.get_server_url(), + 'server_port': self.mcp_manager.get_server_port(), + 'manager': self.mcp_manager + } + + async def _cleanup_mcp_async(self): + """ + Async MCP cleanup: graceful server shutdown. + """ + if self.mcp_manager: + self.mcp_manager.stop_server() # Now synchronous, no await needed + self.mcp_manager = None + self.config = None + self._setup_params = None + + def is_mcp_active(self): + """ + Check if MCP is currently active. + + :return: True if MCP manager is initialized and server is running + :rtype: bool + """ + return (self.mcp_manager is not None and + self.mcp_manager.is_server_running()) + + def get_config(self): + """ + Get the current configuration object. + + :return: Holmes configuration object if setup succeeded, None otherwise + :rtype: object or None + """ + return self.config + + +def _build_aks_context(cluster_name, resource_group_name, subscription_id, is_mcp_mode): + """ + Build AKS context prompt for the AI agent. + + :param cluster_name: AKS cluster name + :param resource_group_name: Resource group name + :param subscription_id: Azure subscription ID + :param is_mcp_mode: Whether running in MCP mode (affects toolset instructions) + :return: Rendered AKS context prompt + """ + from holmes.plugins.prompts import load_and_render_prompt + + aks_template_context = { + "cluster_name": cluster_name, + "resource_group": resource_group_name, + "subscription_id": subscription_id, + "is_mcp_mode": is_mcp_mode, + } + + # Select the appropriate prompt template based on mode + prompt_template = AKS_CONTEXT_PROMPT_MCP if is_mcp_mode else AKS_CONTEXT_PROMPT_TRADITIONAL + + return load_and_render_prompt(prompt_template, aks_template_context) + + +def _run_interactive_mode_sync(ai, cmd, resource_group_name, name, + prompt, console, show_tool_output, is_mcp_mode): + """ + Run interactive mode synchronously - no event loop conflicts. + + This function runs Holmes interactive loop without any async context, + preventing the nested event loop conflicts that were causing failures. + + :param ai: Holmes AI client (pre-configured with toolsets) + :param cmd: Azure CLI command context + :param resource_group_name: AKS resource group name + :param name: AKS cluster name + :param prompt: Initial prompt (optional) + :param console: Console object for output + :param show_tool_output: Whether to show tool output + :param is_mcp_mode: Whether running in MCP mode (affects prompt selection) + """ + from holmes.interactive import run_interactive_loop + + # Prepare AKS context with mode-specific prompt + subscription_id = get_subscription_id(cmd.cli_ctx) + aks_context = _build_aks_context(name, resource_group_name, subscription_id, is_mcp_mode) + + console.print( + "[bold yellow]This tool uses AI to generate responses and may not always be accurate.[bold yellow]" + ) + + # Holmes interactive loop - runs synchronously ✅ + run_interactive_loop( + ai, console, prompt, None, None, + show_tool_output=show_tool_output, + system_prompt_additions=aks_context + ) + + +def _run_noninteractive_mode_sync(ai, config, cmd, resource_group_name, name, + prompt, console, echo, show_tool_output, is_mcp_mode): + """ + Run non-interactive mode synchronously. + + :param ai: Holmes AI client (pre-configured with toolsets) + :param config: Holmes configuration object (for runbook catalog) + :param cmd: Azure CLI command context + :param resource_group_name: AKS resource group name + :param name: AKS cluster name + :param prompt: User prompt + :param console: Console object for output + :param echo: Whether to echo prompts + :param show_tool_output: Whether to show tool output + :param is_mcp_mode: Whether running in MCP mode (affects prompt selection) + """ + import uuid + import socket + from holmes.core.prompt import build_initial_ask_messages + from holmes.plugins.destinations import DestinationType + from holmes.plugins.interfaces import Issue + from holmes.utils.console.result import handle_result + + # Prepare AKS context with mode-specific prompt + subscription_id = get_subscription_id(cmd.cli_ctx) + aks_context = _build_aks_context(name, resource_group_name, subscription_id, is_mcp_mode) + + console.print( + "[bold yellow]This tool uses AI to generate responses and may not always be accurate.[bold yellow]" + ) + + if echo and prompt: + console.print("[bold yellow]User:[/bold yellow] " + prompt) + + # Build and execute the conversation + messages = build_initial_ask_messages( + console, prompt, None, ai.tool_executor, + config.get_runbook_catalog(), system_prompt_additions=aks_context + ) + + response = ai.call(messages) + + # Handle the result + issue = Issue( + id=str(uuid.uuid4()), name=prompt, source_type="holmes-ask", + raw={"prompt": prompt, "full_conversation": response.messages}, + source_instance_id=socket.gethostname() + ) + + handle_result(response, console, DestinationType.CLI, config, + issue, show_tool_output, False) + + +def _setup_traditional_mode_sync(config_file: str, model: str, api_key: str, + max_steps: int, verbose: bool = False): + """ + Synchronous wrapper for traditional mode setup. + + :param config_file: Path to configuration file + :param model: Model name + :param api_key: API key + :param max_steps: Maximum steps + :param verbose: Enable verbose output + :return: Traditional Holmes configuration + """ + from pathlib import Path + import yaml + import tempfile + from holmes.config import Config + from .config_generator import ConfigurationGenerator + + # Load base config + expanded_config_file = Path(os.path.expanduser(config_file)) + base_config_dict = {} + + if expanded_config_file.exists(): + try: + with open(expanded_config_file, 'r') as f: + base_config_dict = yaml.safe_load(f) or {} + except Exception as e: # pylint: disable=broad-exception-caught + if verbose: + print(f"Warning: Could not load config file: {e}") + base_config_dict = {} + + # Generate traditional config + traditional_config_dict = ConfigurationGenerator.generate_traditional_config(base_config_dict) + + # Create temporary config and load + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as temp_file: + yaml.dump(traditional_config_dict, temp_file) + temp_config_path = temp_file.name + + try: + return Config.load_from_file(Path(temp_config_path), model=model, + api_key=api_key, max_steps=max_steps) + finally: + try: + os.unlink(temp_config_path) + except OSError: + pass diff --git a/src/aks-agent/azext_aks_agent/agent/binary_manager.py b/src/aks-agent/azext_aks_agent/agent/binary_manager.py new file mode 100644 index 00000000000..02ac4d7b93e --- /dev/null +++ b/src/aks-agent/azext_aks_agent/agent/binary_manager.py @@ -0,0 +1,517 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- + +""" +Binary management module for AKS MCP integration. + +This module handles the download, validation, and management of the aks-mcp binary +required for Model Context Protocol integration with the AKS agent. +""" + +import os +import platform +import subprocess +import stat +from typing import Optional + +from .._consts import ( + CONST_MCP_BINARY_NAME, + CONST_MCP_MIN_VERSION, + CONST_MCP_GITHUB_REPO, +) +from .error_handler import BinaryError +from .status_models import BinaryStatus + + +class AksMcpBinaryManager: + """Manages aks-mcp binary download and validation.""" + + GITHUB_RELEASES_URL = f"https://api.github.com/repos/{CONST_MCP_GITHUB_REPO}/releases" + + def __init__(self, install_dir: str): + """ + Initialize the binary manager. + + :param install_dir: Directory where the binary should be installed + :type install_dir: str + """ + self.install_dir = install_dir + self.binary_path = self._get_binary_path() + + def _get_binary_path(self) -> str: + """Get the expected path for the binary.""" + binary_name = CONST_MCP_BINARY_NAME + if platform.system() == "Windows": + binary_name += ".exe" + return os.path.join(self.install_dir, binary_name) + + def get_binary_path(self) -> str: + """Get expected binary path.""" + return self.binary_path + + def is_binary_available(self) -> bool: + """ + Check if binary exists and is executable. + + :return: True if binary is available and executable, False otherwise + :rtype: bool + """ + if not os.path.exists(self.binary_path): + return False + + # Check if file is executable + try: + return os.access(self.binary_path, os.X_OK) + except OSError: + return False + + def get_binary_version(self) -> Optional[str]: + """ + Get binary version if available. + + :return: Version string if available, None otherwise + :rtype: Optional[str] + """ + if not self.is_binary_available(): + return None + + try: + # Run the binary with --version flag + result = subprocess.run( + [self.binary_path, "--version"], + capture_output=True, + text=True, + timeout=10, + check=False + ) + + if result.returncode == 0: + output = result.stdout.strip() + # Parse aks-mcp version output format: + # "aks-mcp version v0.0.6-16-ga7464eb+2025-08-18T13:33:38Z" + import re + + # First try to match git-style version format: v0.0.6-16-ga7464eb + git_version_match = re.search(r'v?(\d+\.\d+\.\d+)(?:-\d+-g[a-f0-9]+)?(?:\+[^\s]+)?', output) + if git_version_match: + return git_version_match.group(1) + + # Fallback to simple semantic version pattern + version_match = re.search(r'(\d+\.\d+\.\d+(?:\.\d+)?)', output) + if version_match: + return version_match.group(1) + + return None + + except (subprocess.TimeoutExpired, subprocess.SubprocessError, OSError): + return None + + def validate_version(self, required_version: str = CONST_MCP_MIN_VERSION) -> bool: + """ + Validate binary meets version requirements. + + :param required_version: Minimum required version (default from constants) + :type required_version: str + :return: True if version meets requirements, False otherwise + :rtype: bool + """ + current_version = self.get_binary_version() + if not current_version: + return False + + try: + # Parse version strings for comparison + def parse_version(version_str): + return tuple(map(int, version_str.split('.'))) + + current_parsed = parse_version(current_version) + required_parsed = parse_version(required_version) + + return current_parsed >= required_parsed + + except (ValueError, TypeError): + # If version parsing fails, assume invalid + return False + + def _create_installation_directory(self) -> bool: + """ + Create installation directory if needed. + + :return: True if directory exists or was created successfully, False otherwise + :rtype: bool + """ + try: + os.makedirs(self.install_dir, exist_ok=True) + return True + except (OSError, PermissionError): + return False + + async def ensure_binary(self, progress_callback: callable = None) -> BinaryStatus: + """ + Complete binary availability check and auto-download. + + This is the main entry point for binary management. It checks if the binary + is available and valid, and automatically downloads it if needed. + + :param progress_callback: Optional callback for download progress updates + :type progress_callback: callable + :return: Binary status information + :rtype: BinaryStatus + """ + try: + # Check if binary already exists and is valid using enhanced status + if self.is_binary_available(): + version = self.get_binary_version() + version_valid = self.validate_version() + status = BinaryStatus.from_file_path( + self.binary_path, + version=version, + version_valid=version_valid + ) + + if status.ready: + # Binary is ready to use + return status + else: + # Binary not available, create basic status + status = BinaryStatus(path=self.binary_path) + + # Binary is missing or invalid, attempt to download + if not self._create_installation_directory(): + status.error_message = f"Failed to create installation directory: {self.install_dir}" + return status + + # Download the binary + download_success = await self.download_binary(progress_callback=progress_callback) + + if download_success: + # Verify the downloaded binary using enhanced status + version = self.get_binary_version() + version_valid = self.validate_version() + + if os.path.exists(self.binary_path): + status = BinaryStatus.from_file_path( + self.binary_path, + version=version, + version_valid=version_valid + ) + else: + status = BinaryStatus( + available=True, + version=version, + path=self.binary_path, + version_valid=version_valid + ) + + if not status.ready and version is not None: + # Binary present but version invalid + status.error_message = "Downloaded binary failed validation" + else: + status.error_message = "Binary download failed" + + except Exception as e: # pylint: disable=broad-exception-caught + status = BinaryStatus( + path=self.binary_path, + error_message=f"Unexpected error during binary management: {str(e)}" + ) + + return status + + def _get_platform_info(self) -> tuple[str, str]: + """ + Get platform and architecture information for binary selection. + + :return: Tuple of (platform, architecture) + :rtype: tuple[str, str] + """ + system = platform.system().lower() + machine = platform.machine().lower() + + # Map platform names + platform_map = { + "windows": "windows", + "darwin": "darwin", + "linux": "linux" + } + + # Map architecture names + arch_map = { + "x86_64": "amd64", + "amd64": "amd64", + "arm64": "arm64", + "aarch64": "arm64" + } + + platform_name = platform_map.get(system, system) + arch_name = arch_map.get(machine, machine) + + return platform_name, arch_name + + def _make_binary_executable(self, binary_path: str) -> bool: + """ + Set executable permissions on the binary (Unix-like systems). + + :param binary_path: Path to the binary file + :type binary_path: str + :return: True if successful, False otherwise + :rtype: bool + """ + try: + if platform.system() != "Windows": + # Set executable permissions (owner, group, others can execute) + current_mode = os.stat(binary_path).st_mode + os.chmod(binary_path, current_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH) + return True + except (OSError, IOError): + return False + + async def get_latest_release_info(self) -> dict: + """ + Get latest release information from GitHub API. + + :return: Release information dictionary + :rtype: dict + :raises: Exception if API request fails + """ + import json + import aiohttp + + try: + async with aiohttp.ClientSession() as session: + async with session.get( + f"{self.GITHUB_RELEASES_URL}/latest", + timeout=aiohttp.ClientTimeout(total=30) + ) as response: + if response.status == 200: + return await response.json() + raise RuntimeError(f"GitHub API request failed with status {response.status}") + + except aiohttp.ClientError as e: + raise RuntimeError(f"Network error accessing GitHub API: {e}") from e + except json.JSONDecodeError as e: + raise RuntimeError(f"Failed to parse GitHub API response: {e}") from e + + def _get_platform_binary_name(self) -> str: + """ + Get platform-specific binary name from GitHub release assets. + + :return: Binary name pattern for the current platform + :rtype: str + """ + platform_name, arch_name = self._get_platform_info() + + # GitHub release asset naming convention for aks-mcp (actual format) + # Examples: aks-mcp-linux-amd64, aks-mcp-windows-amd64.exe, aks-mcp-darwin-arm64 + binary_name = f"aks-mcp-{platform_name}-{arch_name}" + + if platform.system() == "Windows": + binary_name += ".exe" + + return binary_name + + def _verify_binary_integrity(self, file_path: str, release_info: dict = None) -> bool: # pylint: disable=too-many-locals,too-many-return-statements,too-many-nested-blocks + """ + Verify downloaded binary integrity using in-toto attestation files. + + :param file_path: Path to the downloaded binary + :type file_path: str + :param release_info: Release information from GitHub API (optional) + :type release_info: dict + :return: True if binary integrity is verified, False otherwise + :rtype: bool + """ + try: + # Basic file existence check + if not os.path.exists(file_path): + return False + + # If no release info provided, use basic verification + if not release_info: + return True + + # Find the corresponding .intoto.jsonl file + binary_filename = os.path.basename(file_path) + intoto_filename = f"{binary_filename}.intoto.jsonl" + intoto_url = None + + for asset in release_info.get("assets", []): + if asset["name"] == intoto_filename: + intoto_url = asset["browser_download_url"] + break + + if not intoto_url: + # No attestation file found, fall back to basic verification + return True + + # Try to download and verify attestation synchronously + # For production, we could enhance this with proper async HTTP client + # For now, use basic verification as fallback + try: + import urllib.request + import json + import hashlib + + # Download attestation file + with urllib.request.urlopen(intoto_url, timeout=30) as response: + if response.status != 200: + return True # Fall back to basic verification + attestation_content = response.read().decode('utf-8') + + # Parse the in-toto attestation (JSONL format - one JSON object per line) + for line in attestation_content.strip().split('\n'): # pylint: disable=too-many-nested-blocks + if line.strip(): + try: + attestation = json.loads(line) + + # Look for the subject information containing the binary hash + predicate = attestation.get("predicate", {}) + materials = predicate.get("materials", {}) + + # Try to find hash in different possible locations in the attestation + binary_hash = None + + # Check for direct subject hash + if "subject" in attestation and isinstance(attestation["subject"], list): + for subject in attestation["subject"]: + if subject.get("name") == binary_filename: + digest = subject.get("digest", {}) + binary_hash = digest.get("sha256") + break + + # Fallback: Check materials for hash entries + if not binary_hash and isinstance(materials, list): + for material in materials: + if material.get("uri", "").endswith(binary_filename): + digest = material.get("digest", {}) + binary_hash = digest.get("sha256") + break + + if not binary_hash: + continue + + # Compute SHA-256 of the binary file + sha256_hash = hashlib.sha256() + with open(file_path, 'rb') as f: + while True: + chunk = f.read(8192) + if not chunk: + break + sha256_hash.update(chunk) + calculated_hash = sha256_hash.hexdigest() + + # Verify the hash matches + return calculated_hash == binary_hash + + except (json.JSONDecodeError, KeyError): + continue + + # If we couldn't parse the attestation or find hash, fall back to basic verification + return True + + except Exception: # pylint: disable=broad-exception-caught + # If attestation verification fails, fall back to basic verification + return True + + except Exception: # pylint: disable=broad-exception-caught + # If any error occurs, fall back to basic file existence check for reliability + try: + return os.path.exists(file_path) + except OSError: + return False + + async def download_binary( + self, + progress_callback: callable = None + ) -> bool: + """ + Download binary from GitHub releases. + + :param progress_callback: Callback function for progress updates (downloaded, total, filename) + :type progress_callback: callable + :return: True if download successful, False otherwise + :rtype: bool + """ + import aiohttp + + try: + # Get release information + release_info = await self.get_latest_release_info() + + # Find the appropriate asset for current platform + platform_binary_name = self._get_platform_binary_name() + download_url = None + + for asset in release_info.get("assets", []): + if asset["name"] == platform_binary_name: + download_url = asset["browser_download_url"] + break + + if not download_url: + raise BinaryError( + f"No binary available for your platform ({platform_binary_name})", + "PLATFORM_UNSUPPORTED", + [ + "Check if your platform is supported by the aks-mcp project", + "Try using --no-aks-mcp to disable MCP integration", + f"Manually install the binary from {self.GITHUB_RELEASES_URL} if available" + ] + ) + + # Create installation directory if it doesn't exist + if not self._create_installation_directory(): + raise BinaryError( + f"Cannot create installation directory: {self.install_dir}", + "DIRECTORY_CREATION_FAILED", + [ + "Check if you have write permissions to the Azure CLI config directory", + "Ensure sufficient disk space is available", + "Try running with elevated permissions if necessary" + ] + ) + + # Download the binary + async with aiohttp.ClientSession() as session: + async with session.get( + download_url, + timeout=aiohttp.ClientTimeout(total=300) # 5 minutes timeout + ) as response: + if response.status != 200: + raise RuntimeError(f"Download failed with status {response.status}") + + total_size = int(response.headers.get('content-length', 0)) + downloaded = 0 + + # Download with progress tracking + with open(self.binary_path, 'wb') as f: + async for chunk in response.content.iter_chunked(8192): + f.write(chunk) + downloaded += len(chunk) + + if progress_callback and total_size > 0: + progress_callback(downloaded, total_size, platform_binary_name) + + # Verify download integrity using in-toto attestation (now synchronous) + if not self._verify_binary_integrity(self.binary_path, release_info): + # Clean up invalid binary + try: + os.remove(self.binary_path) + except OSError: + pass + raise RuntimeError("Downloaded binary failed integrity check") + + # Make binary executable + if not self._make_binary_executable(self.binary_path): + raise RuntimeError("Failed to make binary executable") + + return True + + except Exception as e: + # Clean up partial download on failure + try: + if os.path.exists(self.binary_path): + os.remove(self.binary_path) + except OSError: + pass + raise e diff --git a/src/aks-agent/azext_aks_agent/agent/config_generator.py b/src/aks-agent/azext_aks_agent/agent/config_generator.py new file mode 100644 index 00000000000..71d29134155 --- /dev/null +++ b/src/aks-agent/azext_aks_agent/agent/config_generator.py @@ -0,0 +1,174 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- + +""" +Configuration generation for AKS Agent with MCP integration support. +""" + +from typing import Dict, Any +import copy + + +class ConfigurationGenerator: + """Generate Holmes configuration for different modes (MCP vs Traditional).""" + + # Default built-in toolsets that conflict with MCP + DEFAULT_CONFLICTING_TOOLSETS = { + "aks/node-health": {"enabled": True}, + "aks/core": {"enabled": True}, + "kubernetes/core": {"enabled": True}, + "kubernetes/logs": {"enabled": True}, + "kubernetes/live-metrics": {"enabled": True}, + "bash": {"enabled": True} + } + + @staticmethod + def generate_mcp_config(base_config: Dict[str, Any], server_url: str) -> Dict[str, Any]: + """ + Generate MCP mode configuration. + + :param base_config: Base Holmes configuration dictionary + :param server_url: MCP server URL (e.g., "http://localhost:8003/sse") + :return: Enhanced configuration with MCP server integration + """ + if not base_config: + base_config = {} + + if not server_url: + raise ValueError("server_url is required for MCP configuration") + + # Deep copy to avoid modifying the original + config = copy.deepcopy(base_config) + + # Disable conflicting built-in toolsets + toolsets = config.get("toolsets", {}) + for toolset_name in ConfigurationGenerator.DEFAULT_CONFLICTING_TOOLSETS: + toolsets[toolset_name] = {"enabled": False} + + config["toolsets"] = toolsets + + # Add MCP server configuration + mcp_servers = config.get("mcp_servers", {}) + mcp_servers["aks-mcp"] = { + "description": "AKS MCP server", + "url": server_url + } + config["mcp_servers"] = mcp_servers + + return config + + @staticmethod + def generate_traditional_config(base_config: Dict[str, Any]) -> Dict[str, Any]: + """ + Generate traditional mode configuration. + + :param base_config: Base Holmes configuration dictionary + :return: Configuration with traditional built-in toolsets enabled + """ + if not base_config: + base_config = {} + + # Deep copy to avoid modifying the original + config = copy.deepcopy(base_config) + + # Ensure all built-in toolsets are enabled (default behavior) + toolsets = config.get("toolsets", {}) + + for toolset_name, toolset_config in ConfigurationGenerator.DEFAULT_CONFLICTING_TOOLSETS.items(): + toolsets[toolset_name] = copy.deepcopy(toolset_config) + + config["toolsets"] = toolsets + + # Remove any MCP server configurations if they exist + if "mcp_servers" in config: + del config["mcp_servers"] + + return config + + @staticmethod + def merge_configs(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]: + """ + Merge configuration dictionaries with deep merge for nested structures. + + :param base: Base configuration dictionary + :param override: Override configuration dictionary + :return: Merged configuration with override values taking precedence + """ + if not base: + return copy.deepcopy(override) if override else {} + + if not override: + return copy.deepcopy(base) + + result = copy.deepcopy(base) + + for key, value in override.items(): + if key in result and isinstance(result[key], dict) and isinstance(value, dict): + # Deep merge nested dictionaries + result[key] = ConfigurationGenerator.merge_configs(result[key], value) + else: + # Override value + result[key] = copy.deepcopy(value) + + return result + + @staticmethod + def validate_mcp_config(config: Dict[str, Any]) -> bool: + """ + Validate that MCP configuration is properly structured. + + :param config: Configuration dictionary to validate + :return: True if valid MCP configuration + """ + if not isinstance(config, dict): + return False + + # Check MCP servers exist + mcp_servers = config.get("mcp_servers") + if not isinstance(mcp_servers, dict): + return False + + # Check aks-mcp server configuration + aks_mcp = mcp_servers.get("aks-mcp") + if not isinstance(aks_mcp, dict): + return False + + required_fields = ["description", "url"] + for field in required_fields: + if field not in aks_mcp: + return False + + # Check conflicting toolsets are disabled + toolsets = config.get("toolsets", {}) + for toolset_name in ConfigurationGenerator.DEFAULT_CONFLICTING_TOOLSETS: + toolset_config = toolsets.get(toolset_name, {}) + if toolset_config.get("enabled", True): # Default enabled if not specified + return False + + return True + + @staticmethod + def validate_traditional_config(config: Dict[str, Any]) -> bool: + """ + Validate that traditional configuration is properly structured. + + :param config: Configuration dictionary to validate + :return: True if valid traditional configuration + """ + if not isinstance(config, dict): + return False + + # Check that MCP servers are not configured + if "mcp_servers" in config: + return False + + # Check that traditional toolsets are enabled + toolsets = config.get("toolsets", {}) + for toolset_name in ConfigurationGenerator.DEFAULT_CONFLICTING_TOOLSETS: + toolset_config = toolsets.get(toolset_name, {}) + if not toolset_config.get("enabled", False): + return False + + return True diff --git a/src/aks-agent/azext_aks_agent/agent/error_handler.py b/src/aks-agent/azext_aks_agent/agent/error_handler.py new file mode 100644 index 00000000000..606231e8722 --- /dev/null +++ b/src/aks-agent/azext_aks_agent/agent/error_handler.py @@ -0,0 +1,255 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- + +""" +Centralized error handling and user-friendly message generation for AKS Agent. +Provides consistent error formatting and actionable guidance for common failure scenarios. +""" + +from typing import Dict, Any, List +from knack.util import CLIError + + +class AgentError(Exception): + """Base exception for AKS Agent errors with enhanced user messaging.""" + + def __init__(self, message: str, error_code: str = None, suggestions: List[str] = None): + """ + Initialize agent error with user-friendly message and suggestions. + + :param message: Primary error message + :param error_code: Unique error code for debugging + :param suggestions: List of actionable suggestions for the user + """ + super().__init__(message) + self.message = message + self.error_code = error_code or "GENERAL" + self.suggestions = suggestions or [] + + +class MCPError(AgentError): + """MCP-specific errors with fallback guidance.""" + + def __init__(self, message: str, error_code: str = None, suggestions: List[str] = None): + default_suggestions = [ + "Try running with --no-aks-mcp to use traditional mode", + "Check your internet connection for MCP binary download", + "Verify that port 8003 is available for MCP server" + ] + combined_suggestions = (suggestions or []) + default_suggestions + super().__init__(message, error_code or "MCP", combined_suggestions) + + +class BinaryError(MCPError): + """Binary download and management errors.""" + + def __init__(self, message: str, error_code: str = None, suggestions: List[str] = None): + default_suggestions = [ + "Ensure you have internet connectivity to download the MCP binary", + "Check if your firewall allows connections to GitHub releases", + "Try running the command again - downloads may be temporarily unavailable" + ] + combined_suggestions = (suggestions or []) + default_suggestions + super().__init__(message, error_code or "BINARY", combined_suggestions) + + +class ServerError(MCPError): + """MCP server lifecycle and communication errors.""" + + def __init__(self, message: str, error_code: str = None, suggestions: List[str] = None): + default_suggestions = [ + "Check if another process is using the MCP server port", + "Ensure the MCP binary has execute permissions", + "Try restarting the agent command" + ] + combined_suggestions = (suggestions or []) + default_suggestions + super().__init__(message, error_code or "SERVER", combined_suggestions) + + +class ConfigurationError(AgentError): + """Configuration generation and validation errors.""" + + def __init__(self, message: str, error_code: str = None, suggestions: List[str] = None): + default_suggestions = [ + "Check that your configuration file is valid YAML", + "Verify all required configuration fields are present", + "Try removing custom configuration to use defaults" + ] + combined_suggestions = (suggestions or []) + default_suggestions + super().__init__(message, error_code or "CONFIG", combined_suggestions) + + +class AgentErrorHandler: + """Centralized error handling and message formatting for AKS Agent.""" + + @staticmethod + def format_error_message(error: Exception, show_suggestions: bool = True) -> str: + """ + Format error message with consistent styling and suggestions. + + :param error: Exception to format + :param show_suggestions: Whether to include suggestions in output + :return: Formatted error message + """ + if isinstance(error, AgentError): + message = f"AKS Agent Error ({error.error_code}): {error.message}" + + if show_suggestions and error.suggestions: + message += "\n\nSuggestions:" + for i, suggestion in enumerate(error.suggestions, 1): + message += f"\n {i}. {suggestion}" + + return message + + # Handle non-AgentError exceptions + return f"AKS Agent Error: {str(error)}" + + @staticmethod + def create_cli_error(agent_error: AgentError, show_suggestions: bool = True) -> CLIError: + """ + Convert AgentError to CLIError with formatted message. + + :param agent_error: AgentError instance + :param show_suggestions: Whether to include suggestions + :return: CLIError with formatted message + """ + formatted_message = AgentErrorHandler.format_error_message(agent_error, show_suggestions) + return CLIError(formatted_message) + + @staticmethod + def handle_mcp_setup_error(original_error: Exception, context: str = "") -> MCPError: + """ + Handle MCP setup errors with contextual suggestions. + + :param original_error: Original exception that occurred + :param context: Context of the operation (e.g., "initialization") + :return: Enhanced MCPError with specific guidance + """ + error_message = "MCP setup failed" + if context: + error_message += f" during {context}" + error_message += f": {str(original_error)}" + + suggestions = [] + error_str = str(original_error).lower() + + # Provide specific suggestions based on error content + if "network" in error_str or "connection" in error_str: + suggestions.extend([ + "Check your internet connection", + "Verify firewall settings allow GitHub access", + "Try again after a few minutes if GitHub is temporarily unavailable" + ]) + elif "permission" in error_str or "access" in error_str: + suggestions.extend([ + "Check file system permissions in your Azure CLI config directory", + "Ensure you have write access to download the MCP binary", + "Try running with elevated permissions if necessary" + ]) + elif "port" in error_str or "address" in error_str: + suggestions.extend([ + "Check if port 8003 is already in use by another application", + "Close other applications that might be using network ports", + "Restart your terminal/command prompt and try again" + ]) + + return MCPError(error_message, "MCP_SETUP", suggestions) + + @staticmethod + def handle_binary_error(original_error: Exception, operation: str) -> BinaryError: + """ + Handle binary-related errors with operation-specific guidance. + + :param original_error: Original exception that occurred + :param operation: Operation being performed (e.g., "download", "validation") + :return: Enhanced BinaryError with specific guidance + """ + error_message = f"Binary {operation} failed: {str(original_error)}" + + suggestions = [] + + if operation == "download": + suggestions.extend([ + "Verify you have internet connectivity", + "Check if GitHub.com is accessible from your network", + "Try using a VPN if you're behind a corporate firewall" + ]) + elif operation == "validation": + suggestions.extend([ + "The downloaded binary may be corrupted - try downloading again", + "Check if your antivirus software is interfering with the binary", + "Ensure you have sufficient disk space for the binary" + ]) + elif operation == "execution": + suggestions.extend([ + "Check if the binary has execute permissions", + "Verify the binary is compatible with your platform", + "Try downloading a fresh copy of the binary" + ]) + + return BinaryError(error_message, f"BINARY_{operation.upper()}", suggestions) + + @staticmethod + def handle_server_error(original_error: Exception, operation: str) -> ServerError: + """ + Handle server-related errors with operation-specific guidance. + + :param original_error: Original exception that occurred + :param operation: Operation being performed (e.g., "startup", "health_check") + :return: Enhanced ServerError with specific guidance + """ + error_message = f"MCP server {operation} failed: {str(original_error)}" + + suggestions = [] + + if operation == "startup": + suggestions.extend([ + "Check if the MCP binary is available and executable", + "Verify no other process is using the MCP server port", + "Check system resources (memory, CPU) availability" + ]) + elif operation == "health_check": + suggestions.extend([ + "The MCP server may have crashed - it will be automatically restarted", + "Check system logs for any server error messages", + "Try restarting the agent command if issues persist" + ]) + elif operation == "communication": + suggestions.extend([ + "Check if the MCP server is still running", + "Verify network connectivity to the server port", + "Try restarting the agent to reinitialize the server" + ]) + + return ServerError(error_message, f"SERVER_{operation.upper()}", suggestions) + + @staticmethod + def create_context_error(context_info: Dict[str, Any]) -> AgentError: + """ + Create user-friendly error for AKS context validation failures. + + :param context_info: Dictionary with detected context information + :return: AgentError with context validation guidance + """ + cluster_name = context_info.get("cluster_name", "None") + resource_group = context_info.get("resource_group", "None") + subscription_id = context_info.get("subscription_id", "None") + + message = "AKS cluster context validation failed.\n\nDetected context:" + message += f"\n- Cluster name: {cluster_name}" + message += f"\n- Resource group: {resource_group}" + message += f"\n- Subscription ID: {subscription_id}" + + suggestions = [ + "Provide the cluster context in your prompt (cluster name, resource group, subscription ID)", + ( + "Restart with explicit flags: --name --resource-group " + "--subscription " + ), + "Ensure you're logged into the correct Azure account with 'az login'", + "Verify the AKS cluster exists and you have access to it", + ] + + return AgentError(message, "CONTEXT_VALIDATION", suggestions) diff --git a/src/aks-agent/azext_aks_agent/agent/mcp_manager.py b/src/aks-agent/azext_aks_agent/agent/mcp_manager.py new file mode 100644 index 00000000000..e641c313ca4 --- /dev/null +++ b/src/aks-agent/azext_aks_agent/agent/mcp_manager.py @@ -0,0 +1,335 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- + +""" +MCP Manager module for AKS MCP integration. + +This module provides the main interface for managing the AKS Model Context Protocol +server lifecycle and integration with the AKS agent. +""" + +import os +import asyncio +from typing import Optional +from azure.cli.core.api import get_config_dir + +from .binary_manager import AksMcpBinaryManager +from .._consts import CONST_MCP_BINARY_DIR +from .error_handler import ServerError + + +class MCPManager: + """MCP lifecycle management with server process control""" + + def __init__(self, config_dir: str = None, verbose: bool = False): + """ + Initialize MCP manager. + + :param config_dir: Configuration directory path (defaults to Azure CLI config dir) + :type config_dir: Optional[str] + :param verbose: Enable verbose output + :type verbose: bool + """ + self.config_dir = config_dir or get_config_dir() + self.binary_manager = AksMcpBinaryManager( + os.path.join(self.config_dir, CONST_MCP_BINARY_DIR) + ) + self.verbose = verbose + + # Server process management + self.server_process = None + self.server_url = None + self.server_port = None + + def is_binary_available(self) -> bool: + """ + Check if MCP binary is available. + + :return: True if binary is available, False otherwise + :rtype: bool + """ + return self.binary_manager.is_binary_available() + + def get_binary_version(self) -> Optional[str]: + """ + Get binary version if available. + + :return: Version string if available, None otherwise + :rtype: Optional[str] + """ + return self.binary_manager.get_binary_version() + + def get_binary_path(self) -> str: + """ + Get the path where the MCP binary should be located. + + :return: Path to the binary + :rtype: str + """ + return self.binary_manager.get_binary_path() + + def validate_binary_version(self) -> bool: + """ + Validate that the binary meets minimum version requirements. + + :return: True if binary meets requirements, False otherwise + :rtype: bool + """ + return self.binary_manager.validate_version() + + async def start_server(self) -> bool: + """ + Start aks-mcp server process. + + :return: True if server started successfully, False otherwise + :rtype: bool + :raises: Exception if server cannot be started + """ + import subprocess + from .user_feedback import ProgressReporter + from .._consts import CONST_MCP_DEFAULT_PORT + + # Check if server is already running + if self.is_server_running(): + if self.is_server_healthy(): + if self.verbose: + ProgressReporter.show_status_message("MCP server is already running and healthy", "info") + return True + # Server is running but unhealthy, stop it first + self.stop_server() + + # Ensure binary is available + if not self.is_binary_available(): + raise ServerError( + "MCP binary is not available for server startup", + "BINARY_UNAVAILABLE", + [ + "The MCP binary should be automatically downloaded", + "Check your internet connection for binary download", + "Try running the command again to retry download", + "Use --no-aks-mcp to disable MCP integration" + ] + ) + + # Find available port + self.server_port = self._find_available_port(CONST_MCP_DEFAULT_PORT) + self.server_url = f"http://localhost:{self.server_port}/sse" + + # Build command to start server + binary_path = self.get_binary_path() + cmd = [binary_path, "--transport", "sse", "--port", str(self.server_port)] + + try: + if self.verbose: + ProgressReporter.show_status_message(f"Starting MCP server on port {self.server_port}", "info") + + # Start the server process + self.server_process = await asyncio.create_subprocess_exec( + *cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + stdin=subprocess.PIPE + ) + + # Wait a moment for server to initialize + await asyncio.sleep(2) + + # Check if server is healthy + if self.is_server_healthy(): + if self.verbose: + ProgressReporter.show_status_message("MCP server started successfully", "info") + return True + # Server failed to start properly + self.stop_server() + raise RuntimeError(f"MCP server failed to start or is unhealthy on port {self.server_port}") + + except Exception as e: + # Clean up on failure + self.stop_server() + raise RuntimeError(f"Failed to start MCP server: {str(e)}") from e + + def stop_server(self) -> None: # pylint: disable=too-many-nested-blocks + """ + Stop aks-mcp server process. + """ + if self.server_process is not None: # pylint: disable=too-many-nested-blocks + try: + # Get process info for debugging + pid = getattr(self.server_process, 'pid', 'unknown') + + if self.verbose: + from .user_feedback import ProgressReporter + ProgressReporter.show_status_message( + f"Attempting graceful shutdown of MCP server (PID: {pid})", "info" + ) + + # Gracefully terminate the process + self.server_process.terminate() + + # Wait up to 8 seconds for graceful shutdown + import time + start_time = time.time() + timeout = 8.0 + check_interval = 0.1 + + while (time.time() - start_time) < timeout: + time.sleep(check_interval) + + # Check if process is actually still running using OS-level check + # This works even when asyncio event loop is closed + try: + if pid != 'unknown': + os.kill(pid, 0) # Signal 0 checks if process exists without killing it + # If we get here, process is still running + else: + # Fallback to asyncio returncode if PID unavailable + if self.server_process.returncode is not None: + break + except (OSError, ProcessLookupError): + # Process no longer exists - graceful shutdown succeeded! + if self.verbose: + from .user_feedback import ProgressReporter + elapsed = time.time() - start_time + ProgressReporter.show_status_message( + f"MCP server shut down gracefully in {elapsed:.2f}s", "info" + ) + return + + # Debug: Check every 2 seconds in verbose mode + if self.verbose and (time.time() - start_time) % 2.0 < check_interval: + elapsed = time.time() - start_time + ProgressReporter.show_status_message(f"Waiting for graceful shutdown... {elapsed:.1f}s", "info") + + # If we get here, timeout was reached and process might still be running + try: + if pid != 'unknown': + os.kill(pid, 0) # Check if still running + # Process is still running, need force kill + if self.verbose: + from .user_feedback import ProgressReporter + ProgressReporter.show_status_message( + f"Graceful shutdown timed out after {timeout}s, using force kill", "warning" + ) + + self.server_process.kill() + + # Wait up to 3 seconds for kill to complete + kill_start = time.time() + while (time.time() - kill_start) < 3.0: + time.sleep(0.1) + try: + os.kill(pid, 0) + except (OSError, ProcessLookupError): + if self.verbose: + ProgressReporter.show_status_message("MCP server force killed successfully", "info") + return + + if self.verbose: + ProgressReporter.show_status_message( + f"Warning: MCP server (PID: {pid}) may still be running", "warning" + ) + else: + # Fallback when PID not available + if self.verbose: + from .user_feedback import ProgressReporter + ProgressReporter.show_status_message( + "Process terminated (PID unavailable, assuming success)", "info" + ) + + except (OSError, ProcessLookupError): + # Process no longer exists - it terminated during our timeout check + if self.verbose: + from .user_feedback import ProgressReporter + ProgressReporter.show_status_message( + "MCP server terminated successfully (detected late)", "info" + ) + + except Exception as e: # pylint: disable=broad-exception-caught + if self.verbose: + from .user_feedback import ProgressReporter + ProgressReporter.show_status_message(f"Error stopping server: {str(e)}", "error") + finally: + self.server_process = None + self.server_url = None + self.server_port = None + + def is_server_running(self) -> bool: + """ + Check if server process is running. + + :return: True if server process is running, False otherwise + :rtype: bool + """ + if self.server_process is None: + return False + + # Check if process is still alive + return self.server_process.returncode is None + + def is_server_healthy(self) -> bool: + """ + Health check server via HTTP. + + :return: True if server is healthy and responding, False otherwise + :rtype: bool + """ + if not self.is_server_running() or self.server_url is None: + return False + + try: + import urllib.request + import urllib.error + + # Simple HTTP health check with short timeout + # The SSE endpoint should respond with appropriate headers even for GET requests + with urllib.request.urlopen(self.server_url, timeout=3) as response: + # Server is responding - consider it healthy + # SSE servers typically return 200 OK even for simple GET requests + return response.status == 200 + + except (urllib.error.URLError, urllib.error.HTTPError, OSError): + return False + + def _find_available_port(self, start_port: int = 8003) -> int: + """ + Find available port for server. + + :param start_port: Port to start searching from (default 8003) + :type start_port: int + :return: Available port number + :rtype: int + """ + import socket + + # Try the preferred port first + for port in range(start_port, start_port + 100): + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + sock.bind(('localhost', port)) + return port + except OSError: + continue + + # If no port is available in range, raise an exception + raise RuntimeError(f"No available ports found in range {start_port}-{start_port + 99}") + + def get_server_url(self) -> Optional[str]: + """ + Get the current server URL if server is running. + + :return: Server URL if available, None otherwise + :rtype: Optional[str] + """ + return self.server_url if self.is_server_running() else None + + def get_server_port(self) -> Optional[int]: + """ + Get the current server port if server is running. + + :return: Server port if available, None otherwise + :rtype: Optional[int] + """ + return self.server_port if self.is_server_running() else None diff --git a/src/aks-agent/azext_aks_agent/agent/prompt.py b/src/aks-agent/azext_aks_agent/agent/prompt.py index 7856c8d6817..554d72c8d66 100644 --- a/src/aks-agent/azext_aks_agent/agent/prompt.py +++ b/src/aks-agent/azext_aks_agent/agent/prompt.py @@ -3,7 +3,8 @@ # Licensed under the MIT License. See License.txt in the project root for license information. # -------------------------------------------------------------------------------------------- -AKS_CONTEXT_PROMPT = """ +# Base context template shared by both modes +_AKS_BASE_CONTEXT = """ # AKS-Specific Context and Workflow You are now operating in Azure Kubernetes Service (AKS) mode. All investigations must consider both Azure control plane and Kubernetes data plane components. @@ -23,6 +24,12 @@ {% else %} **Auto-discovery required** - Detect AKS context using this priority: +{% if is_mcp_mode %} +1. **Primary method**: Use MCP tools from aks-mcp server to get cluster context directly + - MCP tools provide the most comprehensive and reliable context discovery + - Call MCP tools to extract current AKS cluster information +2. **IMPORTANT**: Do not try to use traditional toolsets if MCP tools are not available, just report failure +{% else %} 1. **Primary method**: Check if `aks/core` toolset is available in your toolsets - If available, use the `aks/core` tools to get cluster context directly - This is the preferred method as it provides the most reliable context discovery @@ -30,6 +37,7 @@ - Get current Azure subscription ID - Extract AKS cluster name from current kubeconfig context - Find resource group by listing AKS clusters with matching name in the subscription +{% endif %} **Critical**: You MUST first check toolset availability before choosing the discovery method. @@ -73,15 +81,130 @@ **CRITICAL**: Before performing ANY Kubernetes operations (kubectl commands, checking pods, services, deployments, etc.), you MUST ALWAYS verify that the current kubectl context matches the target AKS cluster name. If it doesn't match, you MUST download the correct credentials and switch context before proceeding. This validation is required EVERY TIME you need to interact with Kubernetes resources, even if you've already validated Azure resources in the same session. **Only proceed with investigation after ALL validation steps pass successfully.** +""" + +# MCP Mode specific toolset instructions +_MCP_TOOLSET_INSTRUCTIONS = """ +## 🚀 MCP Mode - Enhanced AKS Troubleshooting via aks-mcp + +You are running in **MCP Mode** with the aks-mcp server providing MCP tools. + +### ✅ MCP Tool Usage Guidelines: +1. Prefer MCP tools over traditional toolsets when available. +2. Do not fallback to traditional toolsets if MCP tools are unavailable; just report failure. +3. Be explicit about the chosen MCP tool and why. -### AKS Investigation Approach -- **Start with cluster health** (nodes, system pods, control plane) -- **Check Azure-specific components** (load balancers, NSGs, managed identity) -- **Check Kubernetes-specific components** (deployments, services, ingress, namespaces, RBAC) -- **Analyze both Azure and Kubernetes logs** -- **Use AKS-aware tools** from available toolsets -- **Consider AKS limitations and best practices** +### 🧰 Common MCP Tools +When MCP tools are available, prefer these over manual commands: +1. `kubectl_diagnostics` (collects metrics and diagnostics for K8s resources) +2. `aks_core` (reads AKS control plane level info) +3. `network_diagnostics` (inspects network connectivity and related issues) -**Note**: "Cluster" in this context refers to both the Azure-managed AKS cluster AND the Kubernetes resources running within it. Both layers must be validated before proceeding. +### ❓ If unsure which tool to use +Ask the user for more details: +``` +To help me select the most appropriate tool, could you please: +- Specify if you know the exact MCP tool name that should be used +- Provide more specific details about what you want to accomplish +- Include any specific AKS/Kubernetes resource types or operations involved + +Examples of helpful details: +- "Use the [tool-name] tool to..." +- "I need to check pod logs in namespace X" +- "I want to diagnose node issues on cluster Y" +- "I need to examine AKS cluster networking configuration" + +This will help me select the right MCP tool for your specific needs. +``` + +5. **List available MCP tools** if you can see them in your toolset, so the user can choose the most appropriate one + +The MCP server provides comprehensive, integrated tools that combine Azure and Kubernetes operations with enhanced context awareness and intelligent troubleshooting capabilities. +""" +# Traditional Mode specific toolset instructions +_TRADITIONAL_TOOLSET_INSTRUCTIONS = """ +## 🔧 Traditional Mode - Standard AKS Toolset Configuration + +**IMPORTANT - Toolset Usage Guidelines:** +You are running in **Traditional Mode** with standard Holmes toolsets enabled. + +### 🎯 MANDATORY Tool Usage Rules for Traditional Mode: + +**For AKS and Kubernetes operations, follow this strict hierarchy:** + +1. **AKS Operations** - Use in this order: + - **First choice**: `aks/core` toolset tools + - **Second choice**: `aks/node-health` toolset tools + - **Last resort**: Manual instruction to user (avoid bash for complex operations) + +2. **Kubernetes Operations** - Use in this order: + - **First choice**: `kubernetes/core` toolset tools + - **Second choice**: `kubernetes/logs` toolset tools + - **Third choice**: `kubernetes/live-metrics` toolset tools + - **Last resort**: Manual instruction to user (avoid bash for complex operations) + +3. **CRITICAL RESTRICTIONS for bash toolset:** + - **NEVER use bash toolset** for `kubectl` operations + - **NEVER use bash toolset** for `az aks` operations + - **Bash is ONLY acceptable for**: + - Simple file operations (ls, cat, grep on log files) + - Basic system checks (ps, netstat, df) + - Text processing and parsing + - Non-AKS/Kubernetes administrative tasks + +**Why avoid bash for AKS/Kubernetes?** +- Dedicated toolsets provide better error handling +- Integrated context and state management +- Safer parameter validation +- Enhanced output formatting and parsing +- Consistent behavior across environments + +**Tool Selection Examples:** +- ❌ `bash: kubectl get pods` → ✅ `kubernetes/core: get pods` +- ❌ `bash: az aks show` → ✅ `aks/core: get cluster info` +- ✅ `bash: cat /tmp/debug.log` (file operation - acceptable) +- ✅ `bash: grep ERROR application.log` (text processing - acceptable) + +Use specialized toolsets for specialized tasks - they provide better integration, error handling, and user experience. """ + +# MCP Mode prompt template +AKS_CONTEXT_PROMPT_MCP = _AKS_BASE_CONTEXT + _MCP_TOOLSET_INSTRUCTIONS + """ +### AKS Investigation Approach (MCP Mode) +- **Start with cluster health** using MCP tools for comprehensive diagnostics +- **Check Azure-specific components** via MCP integration (load balancers, NSGs, managed identity) +- **Check Kubernetes-specific components** via MCP tools (deployments, services, ingress, namespaces, RBAC) +- **Analyze both Azure and Kubernetes logs** through unified MCP interface +- **Leverage MCP's enhanced capabilities** for cross-layer troubleshooting +- **Consider AKS limitations and best practices** with MCP-provided insights + +## 📚 Example Usage - MCP Enhanced Capabilities + +**Common MCP Tool Usage Examples:** + +1. **Resource Metrics & Diagnostics:** + - To get metrics of Kubernetes resources (e.g., node CPU usage), use `kubectl_diagnostics` tool in aks-mcp + - For memory usage analysis: `kubectl_diagnostics` provides detailed resource consumption data + - For storage metrics: `kubectl_diagnostics` can analyze PV/PVC usage and capacity + +**MCP Advantage:** These tools provide context-aware analysis that traditional kubectl/az commands cannot match, offering deeper insights and automated correlation across the entire AKS stack. + +**Note**: "Cluster" in this context refers to both the Azure-managed AKS cluster AND the Kubernetes resources running within it. MCP tools provide unified access to both layers with enhanced context awareness. +""" + +# Traditional Mode prompt template +AKS_CONTEXT_PROMPT_TRADITIONAL = _AKS_BASE_CONTEXT + _TRADITIONAL_TOOLSET_INSTRUCTIONS + """ +### AKS Investigation Approach (Traditional Mode) +- **Start with cluster health** using `aks/core` and `kubernetes/core` toolsets +- **Check Azure-specific components** using `aks/core` tools (load balancers, NSGs, managed identity) +- **Check Kubernetes-specific components** using `kubernetes/core` and `kubernetes/logs` toolsets (deployments, services, ingress, namespaces, RBAC) +- **Analyze logs** using dedicated `kubernetes/logs` toolset where possible +- **Use AKS-aware tools** from available specialized toolsets +- **Consider AKS limitations and best practices** within traditional toolset capabilities + +**Note**: "Cluster" in this context refers to both the Azure-managed AKS cluster AND the Kubernetes resources running within it. Use specialized toolsets for each layer rather than generic bash commands. +""" + +# Default prompt (for backward compatibility) +AKS_CONTEXT_PROMPT = AKS_CONTEXT_PROMPT_TRADITIONAL diff --git a/src/aks-agent/azext_aks_agent/agent/status.py b/src/aks-agent/azext_aks_agent/agent/status.py new file mode 100644 index 00000000000..33eb1bdfc97 --- /dev/null +++ b/src/aks-agent/azext_aks_agent/agent/status.py @@ -0,0 +1,352 @@ +""" +Status collection and management for AKS Agent. + +This module provides comprehensive status collection for the AKS Agent, +including MCP binary status, server status, and configuration status. +""" + +import os +import json +import psutil +from datetime import datetime +from typing import Optional, Dict, Any + +from azure.cli.core.api import get_config_dir + +from .status_models import AgentStatus, BinaryStatus, ServerStatus, ConfigStatus +from .binary_manager import AksMcpBinaryManager +from .mcp_manager import MCPManager +from .config_generator import ConfigurationGenerator +from .._consts import ( + CONST_MCP_BINARY_DIR, + CONST_AGENT_CONFIG_FILE_NAME +) + + +class AgentStatusManager: # pylint: disable=too-few-public-methods + """Manages agent status collection and reporting.""" + + def __init__(self, config_dir: str = None): + """ + Initialize status manager. + + :param config_dir: Configuration directory path (defaults to Azure CLI config dir) + :type config_dir: Optional[str] + """ + self.config_dir = config_dir or get_config_dir() + self.binary_manager = AksMcpBinaryManager( + os.path.join(self.config_dir, CONST_MCP_BINARY_DIR) + ) + + async def get_status(self) -> AgentStatus: + """ + Get comprehensive agent status. + + :param verbose: Include verbose information in status + :type verbose: bool + :return: Complete agent status information + :rtype: AgentStatus + """ + try: + # Collect status from all components + binary_status = self._get_mcp_binary_status() + server_status = await self._get_server_status() + config_status = self._get_configuration_status() + + # Determine current mode + current_mode = self._determine_current_mode(config_status, binary_status, server_status) + + # Get last used timestamp + last_used = self._get_last_used_timestamp() + + # Create comprehensive status + status = AgentStatus( + mode=current_mode, + mcp_binary=binary_status, + server=server_status, + config=config_status, + last_used=last_used + ) + + return status + + except Exception as e: # pylint: disable=broad-exception-caught + return AgentStatus( + mode="error", + error_message=f"Status collection failed: {str(e)}" + ) + + def _get_mcp_binary_status(self) -> BinaryStatus: + """ + Collect MCP binary status. + + :return: Binary status information + :rtype: BinaryStatus + """ + try: + binary_path = self.binary_manager.get_binary_path() + + if not os.path.exists(binary_path): + return BinaryStatus( + available=False, + path=binary_path, + error_message="Binary not found" + ) + + # Get version and validate + version = self.binary_manager.get_binary_version() + version_valid = self.binary_manager.validate_version() + + # Create status from file information + return BinaryStatus.from_file_path( + binary_path, + version=version, + version_valid=version_valid + ) + + except Exception as e: # pylint: disable=broad-exception-caught + return BinaryStatus( + available=False, + error_message=f"Binary status check failed: {str(e)}" + ) + + async def _get_server_status(self) -> ServerStatus: + """ + Collect MCP server status. + + :return: Server status information + :rtype: ServerStatus + """ + try: + # Create temporary MCP manager to check server status + mcp_manager = MCPManager(config_dir=self.config_dir, verbose=False) + + # Check if server process is running + is_running = mcp_manager.is_server_running() + is_healthy = False + server_url = None + server_port = None + server_pid = None + uptime = None + start_time = None + error_message = None + + if is_running: + # Get server details + server_url = mcp_manager.get_server_url() + server_port = mcp_manager.get_server_port() + + # Try to get process information + if hasattr(mcp_manager, 'server_process') and mcp_manager.server_process: + try: + server_pid = mcp_manager.server_process.pid + + # Get process start time and calculate uptime + if server_pid: + process = psutil.Process(server_pid) + start_time = datetime.fromtimestamp(process.create_time()) + uptime = datetime.now() - start_time + except (psutil.NoSuchProcess, psutil.AccessDenied): + # Process might have died or no access + is_running = False + error_message = "Server process not accessible" + + # Check server health + if is_running: + is_healthy = mcp_manager.is_server_healthy() + if not is_healthy: + error_message = "Server health check failed" + + return ServerStatus( + running=is_running, + healthy=is_healthy, + url=server_url, + port=server_port, + pid=server_pid, + uptime=uptime, + start_time=start_time, + error_message=error_message + ) + + except Exception as e: # pylint: disable=broad-exception-caught + return ServerStatus( + running=False, + healthy=False, + error_message=f"Server status check failed: {str(e)}" + ) + + def _get_configuration_status(self) -> ConfigStatus: + """ + Collect configuration status. + + :return: Configuration status information + :rtype: ConfigStatus + """ + try: + # Get current mode from state file + current_mode = self._get_last_mode() + + # Get configuration file path + config_file_path = os.path.join(self.config_dir, CONST_AGENT_CONFIG_FILE_NAME) + config_file = config_file_path if os.path.exists(config_file_path) else None + + # Initialize status + toolsets_enabled = [] + mcp_servers = [] + config_valid = True + error_message = None + last_mode_change = self._get_last_mode_change_time() + + # Try to parse configuration if it exists + if config_file: + try: + config_data = self._load_config_file(config_file) + if config_data: + # Extract toolsets information + toolsets = config_data.get("toolsets", {}) + toolsets_enabled = [ + name for name, config in toolsets.items() + if config.get("enabled", True) + ] + + # Extract MCP servers information + mcp_servers_config = config_data.get("mcp_servers", {}) + mcp_servers = list(mcp_servers_config.keys()) + + # Validate configuration based on mode + if current_mode == "mcp": + config_valid = ConfigurationGenerator.validate_mcp_config(config_data) + elif current_mode == "traditional": + config_valid = ConfigurationGenerator.validate_traditional_config(config_data) + + except Exception as e: # pylint: disable=broad-exception-caught + config_valid = False + error_message = f"Configuration parsing failed: {str(e)}" + + return ConfigStatus( + mode=current_mode, + config_file=config_file, + toolsets_enabled=toolsets_enabled, + mcp_servers=mcp_servers, + last_mode_change=last_mode_change, + config_valid=config_valid, + error_message=error_message + ) + + except Exception as e: # pylint: disable=broad-exception-caught + return ConfigStatus( + mode="unknown", + config_valid=False, + error_message=f"Configuration status check failed: {str(e)}" + ) + + def _determine_current_mode(self, config_status: ConfigStatus, + binary_status: BinaryStatus, + server_status: ServerStatus) -> str: + """ + Determine the current operational mode based on component status. + + :param config_status: Configuration status + :param binary_status: Binary status + :param server_status: Server status + :return: Current mode string + :rtype: str + """ + # Check configuration mode first + if config_status.is_mcp_mode: + return "mcp" + if config_status.is_traditional_mode: + return "traditional" + + # Infer mode from component status + if binary_status.ready and server_status.running: + return "mcp" + if binary_status.available and not server_status.running: + return "mcp_available" + return "traditional" + + def _get_last_mode(self) -> str: + """ + Get the last used mode from state file. + + :return: Last mode string or 'unknown' if not found + :rtype: str + """ + try: + state_file_path = os.path.join(self.config_dir, "aks_agent_mode_state") + if os.path.exists(state_file_path): + with open(state_file_path, 'r', encoding='utf-8') as f: + state_data = json.load(f) + return state_data.get('last_mode', 'unknown') + except Exception: # pylint: disable=broad-exception-caught + pass + return 'unknown' + + def _get_last_mode_change_time(self) -> Optional[datetime]: + """ + Get the timestamp of the last mode change. + + :return: Last mode change timestamp or None if not available + :rtype: Optional[datetime] + """ + try: + state_file_path = os.path.join(self.config_dir, "aks_agent_mode_state") + if os.path.exists(state_file_path): + # Use file modification time as proxy for mode change time + mod_time = os.path.getmtime(state_file_path) + return datetime.fromtimestamp(mod_time) + except Exception: # pylint: disable=broad-exception-caught + pass + return None + + def _get_last_used_timestamp(self) -> Optional[datetime]: + """ + Get the timestamp when the agent was last used. + + :return: Last used timestamp or None if not available + :rtype: Optional[datetime] + """ + try: + # Check various files to determine last usage + potential_files = [ + os.path.join(self.config_dir, CONST_AGENT_CONFIG_FILE_NAME), + os.path.join(self.config_dir, "aks_agent_mode_state"), + os.path.join(self.config_dir, CONST_MCP_BINARY_DIR, "aks-mcp") + ] + + latest_time = None + for file_path in potential_files: + if os.path.exists(file_path): + mod_time = datetime.fromtimestamp(os.path.getmtime(file_path)) + if latest_time is None or mod_time > latest_time: + latest_time = mod_time + + return latest_time + + except Exception: # pylint: disable=broad-exception-caught + return None + + def _load_config_file(self, config_file_path: str) -> Optional[Dict[str, Any]]: + """ + Load and parse configuration file. + + :param config_file_path: Path to configuration file + :type config_file_path: str + :return: Parsed configuration data or None if failed + :rtype: Optional[Dict[str, Any]] + """ + try: + with open(config_file_path, 'r', encoding='utf-8') as f: + # Try JSON first, then YAML if available + content = f.read().strip() + if content.startswith('{'): + return json.loads(content) + # Try YAML parsing if content doesn't look like JSON + try: + import yaml + return yaml.safe_load(content) + except ImportError: + # YAML not available, assume it's JSON-like + return json.loads(content) + except Exception: # pylint: disable=broad-exception-caught + return None diff --git a/src/aks-agent/azext_aks_agent/agent/status_models.py b/src/aks-agent/azext_aks_agent/agent/status_models.py new file mode 100644 index 00000000000..274c78468a4 --- /dev/null +++ b/src/aks-agent/azext_aks_agent/agent/status_models.py @@ -0,0 +1,254 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- + +"""Status data models for AKS agent components.""" + +import os +from dataclasses import dataclass +from datetime import datetime, timedelta +from typing import List, Optional + + +@dataclass +class BinaryStatus: + """MCP binary status information.""" + + available: bool = False + version: Optional[str] = None + path: Optional[str] = None + last_updated: Optional[datetime] = None + size: Optional[int] = None + version_valid: bool = False + error_message: Optional[str] = None + + @property + def ready(self) -> bool: + """Check if binary is ready for use.""" + return self.available and self.version_valid + + @classmethod + def from_file_path( + cls, + file_path: str, + version: Optional[str] = None, + version_valid: bool = False + ) -> "BinaryStatus": + """Create BinaryStatus from file path with metadata.""" + if not file_path or not os.path.exists(file_path): + return cls(available=False, path=file_path) + + try: + stat = os.stat(file_path) + size = stat.st_size + last_updated = datetime.fromtimestamp(stat.st_mtime) + + return cls( + available=True, + version=version, + path=file_path, + last_updated=last_updated, + size=size, + version_valid=version_valid + ) + except OSError as e: + return cls( + available=False, + path=file_path, + error_message=f"Failed to get file info: {str(e)}" + ) + + +@dataclass +class ServerStatus: # pylint: disable=too-many-instance-attributes + """MCP server status information.""" + + running: bool = False + healthy: bool = False + url: Optional[str] = None + port: Optional[int] = None + pid: Optional[int] = None + uptime: Optional[timedelta] = None + start_time: Optional[datetime] = None + error_message: Optional[str] = None + + @property + def status_text(self) -> str: + """Get human-readable status text.""" + if not self.running: + return "Stopped" + if not self.healthy: + return "Running (Unhealthy)" + return "Running (Healthy)" + + @property + def uptime_text(self) -> str: + """Get human-readable uptime text.""" + if not self.uptime: + return "N/A" + + total_seconds = int(self.uptime.total_seconds()) + hours, remainder = divmod(total_seconds, 3600) + minutes, seconds = divmod(remainder, 60) + + if hours > 0: + return f"{hours}h {minutes}m {seconds}s" + if minutes > 0: + return f"{minutes}m {seconds}s" + return f"{seconds}s" + + +@dataclass +class ConfigStatus: + """Configuration status information.""" + + mode: str = "unknown" + config_file: Optional[str] = None + toolsets_enabled: List[str] = None + mcp_servers: List[str] = None + last_mode_change: Optional[datetime] = None + config_valid: bool = True + error_message: Optional[str] = None + + def __post_init__(self): + """Initialize mutable default values.""" + if self.toolsets_enabled is None: + self.toolsets_enabled = [] + if self.mcp_servers is None: + self.mcp_servers = [] + + @property + def is_mcp_mode(self) -> bool: + """Check if currently in MCP mode.""" + return self.mode.lower() == "mcp" + + @property + def is_traditional_mode(self) -> bool: + """Check if currently in traditional mode.""" + return self.mode.lower() == "traditional" + + @property + def active_toolsets_count(self) -> int: + """Get count of active toolsets.""" + return len(self.toolsets_enabled) + + @property + def mcp_servers_count(self) -> int: + """Get count of configured MCP servers.""" + return len(self.mcp_servers) + + +@dataclass +class AgentStatus: + """Complete agent status information.""" + + mode: str = "unknown" + mcp_binary: Optional[BinaryStatus] = None + server: Optional[ServerStatus] = None + config: Optional[ConfigStatus] = None + last_used: Optional[datetime] = None + overall_health: str = "unknown" + error_message: Optional[str] = None + + def __post_init__(self): + """Initialize default status objects if not provided.""" + if self.mcp_binary is None: + self.mcp_binary = BinaryStatus() + if self.server is None: + self.server = ServerStatus() + if self.config is None: + self.config = ConfigStatus() + + # Update overall health based on component status + self._update_overall_health() + + def _update_overall_health(self): + """Update overall health status based on component status.""" + if self.error_message: + self.overall_health = "error" + return + + # In MCP mode, check all components + if self.config.is_mcp_mode: + if not self.mcp_binary.ready: + self.overall_health = "degraded" + elif not self.server.running: + self.overall_health = "degraded" + elif not self.server.healthy: + self.overall_health = "degraded" + else: + self.overall_health = "healthy" + # In traditional mode, only check config + elif self.config.is_traditional_mode: + if self.config.config_valid: + self.overall_health = "healthy" + else: + self.overall_health = "degraded" + else: + self.overall_health = "unknown" + + @property + def is_healthy(self) -> bool: + """Check if agent is in healthy state.""" + return self.overall_health == "healthy" + + @property + def is_operational(self) -> bool: + """Check if agent is operational (healthy or degraded).""" + return self.overall_health in ["healthy", "degraded"] + + @property + def health_emoji(self) -> str: + """Get emoji representation of health status.""" + health_map = { + "healthy": "✅", + "degraded": "⚠️", + "error": "❌", + "unknown": "❓" + } + return health_map.get(self.overall_health, "❓") + + @property + def mode_emoji(self) -> str: + """Get emoji representation of mode.""" + if self.config.is_mcp_mode: + return "🚀" # MCP mode - enhanced + if self.config.is_traditional_mode: + return "🔧" # Traditional mode - tools + return "❓" # Unknown mode + + def get_summary(self) -> str: + """Get a one-line status summary.""" + health_text = self.overall_health.title() + mode_text = self.mode.title() + return f"{self.health_emoji} {health_text} | {self.mode_emoji} {mode_text} Mode" + + def get_recommendations(self) -> List[str]: + """Get status-based recommendations for the user.""" + recommendations = [] + + # MCP mode specific recommendations + if self.config.is_mcp_mode: + if not self.mcp_binary.available: + recommendations.append("Download the MCP binary to enable enhanced capabilities") + elif not self.mcp_binary.version_valid: + recommendations.append("Update the MCP binary to the latest version") + elif not self.server.running: + recommendations.append("Start the MCP server for enhanced functionality") + elif not self.server.healthy: + recommendations.append("Check MCP server logs for health issues") + + # Traditional mode specific recommendations + elif self.config.is_traditional_mode: + if self.mcp_binary.ready: + recommendations.append("Consider using MCP mode for enhanced capabilities (remove --no-aks-mcp)") + + # General recommendations + if not self.config.config_valid: + recommendations.append("Check configuration file for syntax errors") + + if self.error_message: + recommendations.append(f"Resolve error: {self.error_message}") + + return recommendations diff --git a/src/aks-agent/azext_aks_agent/agent/user_feedback.py b/src/aks-agent/azext_aks_agent/agent/user_feedback.py new file mode 100644 index 00000000000..04677f60166 --- /dev/null +++ b/src/aks-agent/azext_aks_agent/agent/user_feedback.py @@ -0,0 +1,108 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- + +"""User feedback module for AKS Agent operations.""" + +import sys + + +class ProgressReporter: + """Provides user feedback for long-running operations like binary downloads.""" + + @staticmethod + def show_download_progress(downloaded: int, total: int, filename: str, console=None) -> None: + """ + Show download progress with progress bar. + + :param downloaded: Bytes downloaded so far + :param total: Total bytes to download + :param filename: Name of file being downloaded + :param console: Console object for output (optional, will use Holmes console if available) + """ + if total <= 0: + return + + percentage = min(100, (downloaded / total) * 100) + + # If no console provided, try to get Holmes console, otherwise fall back to print + if console is None: + try: + from holmes.utils.console.logging import init_logging + console = init_logging([]) + except ImportError: + # Fallback to simple print if Holmes not available + print(f"Downloading {filename}: {percentage:.1f}% ({downloaded}/{total} bytes)") + return + + # Only show progress if we have a TTY (interactive terminal) + if not sys.stdout.isatty(): + return + + # Create a simple progress bar using rich console formatting + bar_width = 40 + filled_width = int(bar_width * percentage / 100) + progress_bar = "█" * filled_width + "░" * (bar_width - filled_width) + + console.print(f"\r[cyan]Downloading {filename}[/cyan] [{progress_bar}] {percentage:.1f}%", end="") + + # Print newline when download is complete + if percentage >= 100: + console.print("") # New line after completion + + @staticmethod + def show_status_message(message: str, level: str = "info", console=None) -> None: + """ + Show status message with appropriate styling. + + :param message: Message to display + :param level: Message level ('info', 'warning', 'error', 'success') + :param console: Console object for output (optional, will use Holmes console if available) + """ + # If no console provided, try to get Holmes console, otherwise fall back to print + if console is None: + try: + from holmes.utils.console.logging import init_logging + console = init_logging([]) + except ImportError: + # Fallback to simple print if Holmes not available + print(f"[{level.upper()}] {message}") + return + + # Map levels to rich console styles + level_styles = { + "info": "[cyan]", + "warning": "[yellow]", + "error": "[red]", + "success": "[green]" + } + + style = level_styles.get(level.lower(), "[cyan]") + console.print(f"{style}{message}[/{style[1:-1]}]") + + @staticmethod + def show_binary_setup_status(status: str, console=None) -> None: + """ + Show binary setup status message. + + :param status: Status message to display + :param console: Console object for output (optional, will use Holmes console if available) + """ + ProgressReporter.show_status_message(f"MCP Binary: {status}", "info", console) + + @staticmethod + def show_server_status(status: str, silent_mode: bool = True, console=None) -> None: + """ + Show server status (only in verbose mode by default). + + :param status: Server status message + :param silent_mode: If True, only show in verbose mode + :param console: Console object for output (optional, will use Holmes console if available) + """ + if silent_mode: + # Only show server status in verbose mode - implementation depends on verbose flag + # For now, we'll show it as it's useful for debugging + pass + + ProgressReporter.show_status_message(f"MCP Server: {status}", "info", console) diff --git a/src/aks-agent/azext_aks_agent/commands.py b/src/aks-agent/azext_aks_agent/commands.py index 726dbb56590..93205a8739e 100644 --- a/src/aks-agent/azext_aks_agent/commands.py +++ b/src/aks-agent/azext_aks_agent/commands.py @@ -14,3 +14,7 @@ def load_command_table(self, _): "aks", ) as g: g.custom_command("agent", "aks_agent") + with self.command_group( + "aks agent", + ) as g: + g.custom_command("status", "aks_agent_status") diff --git a/src/aks-agent/azext_aks_agent/custom.py b/src/aks-agent/azext_aks_agent/custom.py index 20cc1f68625..0cbfe533265 100644 --- a/src/aks-agent/azext_aks_agent/custom.py +++ b/src/aks-agent/azext_aks_agent/custom.py @@ -4,6 +4,7 @@ # -------------------------------------------------------------------------------------------- # pylint: disable=too-many-lines, disable=broad-except +import os from azext_aks_agent.agent.agent import aks_agent as aks_agent_internal from knack.log import get_logger @@ -26,6 +27,7 @@ def aks_agent( no_echo_request=False, show_tool_output=False, refresh_toolsets=False, + no_aks_mcp=False, ): aks_agent_internal( @@ -41,4 +43,232 @@ def aks_agent( no_echo_request, show_tool_output, refresh_toolsets, + no_aks_mcp, ) + + +def aks_agent_status(cmd): + """ + Show AKS agent configuration and status. + + :param cmd: Azure CLI command context + :return: None (displays status via console output) + """ + try: + from azext_aks_agent.agent.binary_manager import AksMcpBinaryManager + from azext_aks_agent.agent.mcp_manager import MCPManager + from azure.cli.core.api import get_config_dir + from azext_aks_agent._consts import CONST_MCP_BINARY_DIR + + # Initialize status information + status_info = { + "mode": "unknown", + "mcp_binary": { + "available": False, + "path": None, + "version": None + }, + "server": { + "running": False, + "healthy": False, + "url": None, + "port": None + } + } + + try: + # Check MCP binary status + config_dir = get_config_dir() + binary_manager = AksMcpBinaryManager( + os.path.join(config_dir, CONST_MCP_BINARY_DIR) + ) + + # Binary information + binary_available = binary_manager.is_binary_available() + binary_version = binary_manager.get_binary_version() if binary_available else None + binary_path = binary_manager.get_binary_path() + + status_info["mcp_binary"] = { + "available": binary_available, + "path": binary_path, + "version": binary_version, + "version_valid": binary_manager.validate_version() if binary_available else False + } + + # Determine mode based on binary availability + if binary_available and status_info["mcp_binary"]["version_valid"]: + status_info["mode"] = "mcp_ready" + + # Check server status if binary is available + try: + mcp_manager = MCPManager(config_dir, verbose=False) + + status_info["server"] = { + "running": mcp_manager.is_server_running(), + "healthy": mcp_manager.is_server_healthy(), + "url": mcp_manager.get_server_url(), + "port": mcp_manager.get_server_port() + } + + except Exception as e: + logger.debug("Failed to get server status: %s", str(e)) + status_info["server"]["error"] = str(e) + else: + status_info["mode"] = "traditional" + + except Exception as e: + logger.debug("Failed to get binary status: %s", str(e)) + status_info["mcp_binary"]["error"] = str(e) + status_info["mode"] = "traditional" + + # Display status with enhanced formatting + _display_agent_status(status_info) + + # Return None to avoid CLI framework printing the return value + return None + + except Exception as e: + from knack.util import CLIError + raise CLIError(f"Failed to get agent status: {str(e)}") + + +def _display_agent_status(status_info): + """Display formatted status with rich console output.""" + from rich.console import Console + from rich.table import Table + from rich.panel import Panel + + console = Console() + + # Title with emoji + mode_emoji = _get_mode_emoji(status_info.get("mode", "unknown")) + health_emoji = _get_health_emoji(status_info) + + title = f"{health_emoji} AKS Agent Status {mode_emoji}" + console.print(Panel.fit(title, style="bold blue")) + + # Create status table + table = Table(show_header=True, header_style="bold magenta") + table.add_column("Component", style="cyan", width=15) + table.add_column("Status", style="green", width=20) + table.add_column("Details", style="white", width=40) + + # Mode row + mode_text = status_info.get("mode", "unknown").replace("_", " ").title() + table.add_row("Mode", f"{mode_emoji} {mode_text}", "") + + # MCP Binary status + binary_info = status_info.get("mcp_binary", {}) + binary_status = "✅ Available" if binary_info.get("available") else "❌ Not available" + binary_details = [] + + if binary_info.get("version"): + version_valid = binary_info.get("version_valid", True) + version_indicator = "✅" if version_valid else "⚠️" + binary_details.append(f"{version_indicator} v{binary_info['version']}") + + if binary_info.get("error"): + binary_details.append(f"❌ {binary_info['error']}") + + table.add_row("MCP Binary", binary_status, " | ".join(binary_details)) + + # Server status (only if binary is available) + if binary_info.get("available") and status_info.get("mode") in ["mcp_ready", "mcp"]: + server_info = status_info.get("server", {}) + server_status = "" + server_details = [] + + if server_info.get("running"): + if server_info.get("healthy"): + server_status = "✅ Running & Healthy" + else: + server_status = "⚠️ Running (Unhealthy)" + else: + server_status = "❌ Not Running" + + if server_info.get("port"): + server_details.append(f"Port: {server_info['port']}") + + if server_info.get("error"): + server_details.append(f"❌ {server_info['error']}") + + table.add_row("MCP Server", server_status, " | ".join(server_details)) + + console.print(table) + + # Display recommendations + _display_recommendations(console, status_info) + + +def _display_recommendations(console, status_info): + """Display status-based recommendations using rich console.""" + recommendations = _get_recommendations(status_info) + + if not recommendations: + return + + console.print("\n💡 Recommendations:", style="bold yellow") + for rec in recommendations: + console.print(f" • {rec}", style="yellow") + + +def _get_recommendations(status_info): + """Get status-based recommendations for the user.""" + recommendations = [] + binary_info = status_info.get("mcp_binary", {}) + server_info = status_info.get("server", {}) + mode = status_info.get("mode", "unknown") + + if not binary_info.get("available"): + recommendations.append("Run 'az aks agent' to automatically download the MCP binary for enhanced capabilities") + elif not binary_info.get("version_valid", True): + recommendations.append("Update the MCP binary by running 'az aks agent --refresh-toolsets'") + elif mode == "mcp_ready" and not server_info.get("running"): + recommendations.append("MCP binary is ready - run 'az aks agent' to start using enhanced capabilities") + elif mode == "mcp_ready" and server_info.get("running") and not server_info.get("healthy"): + recommendations.append("MCP server is running but unhealthy - it will be automatically restarted on next use") + elif mode in ["mcp_ready", "mcp"] and server_info.get("running") and server_info.get("healthy"): + recommendations.append("✅ AKS agent is ready with enhanced MCP capabilities") + elif mode == "traditional": + if binary_info.get("available"): + recommendations.append( + "Consider using MCP mode for enhanced capabilities by running 'az aks agent' " + "(remove --no-aks-mcp if used)" + ) + else: + recommendations.append("✅ AKS agent is ready in traditional mode") + else: + recommendations.append("✅ AKS agent is operational") + + return recommendations + + +def _get_mode_emoji(mode): + """Get emoji representation of mode.""" + mode_emojis = { + "mcp": "🚀", + "mcp_ready": "🚀", + "traditional": "🔧", + "unknown": "❓" + } + return mode_emojis.get(mode, "❓") + + +def _get_health_emoji(status_info): + """Get emoji representation of overall health status.""" + binary_info = status_info.get("mcp_binary", {}) + server_info = status_info.get("server", {}) + mode = status_info.get("mode", "unknown") + + # Determine health based on mode and component status + if mode == "traditional": + return "✅" # Traditional mode is always healthy if working + if mode in ["mcp_ready", "mcp"]: + if binary_info.get("available") and binary_info.get("version_valid", True): + if server_info.get("running") and server_info.get("healthy"): + return "✅" # Fully healthy + if server_info.get("running"): + return "⚠️" # Running but not healthy + return "⚠️" # Binary ready but server not running + return "❌" # Binary issues + return "❓" # Unknown state diff --git a/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_binary_manager.py b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_binary_manager.py new file mode 100644 index 00000000000..038b5808f7a --- /dev/null +++ b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_binary_manager.py @@ -0,0 +1,602 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- + +import unittest +from unittest import IsolatedAsyncioTestCase +import os +import tempfile +import platform +import stat +import subprocess +from unittest.mock import Mock, patch, AsyncMock + +from azext_aks_agent.agent.binary_manager import AksMcpBinaryManager + + +# Use IsolatedAsyncioTestCase for proper async test method support +class TestAksMcpBinaryManager(IsolatedAsyncioTestCase): + + def setUp(self): + """Set up test fixtures.""" + self.test_install_dir = tempfile.mkdtemp() + self.binary_manager = AksMcpBinaryManager(self.test_install_dir) + + def tearDown(self): + """Clean up test fixtures.""" + import shutil + shutil.rmtree(self.test_install_dir, ignore_errors=True) + + def test_get_binary_path_linux(self): + """Test binary path resolution on Linux.""" + with patch('platform.system', return_value='Linux'): + manager = AksMcpBinaryManager('/test/dir') + expected_path = os.path.join('/test/dir', 'aks-mcp') + self.assertEqual(manager.get_binary_path(), expected_path) + + def test_get_binary_path_windows(self): + """Test binary path resolution on Windows.""" + with patch('platform.system', return_value='Windows'): + manager = AksMcpBinaryManager('/test/dir') + expected_path = os.path.join('/test/dir', 'aks-mcp.exe') + self.assertEqual(manager.get_binary_path(), expected_path) + + def test_get_binary_path_darwin(self): + """Test binary path resolution on macOS.""" + with patch('platform.system', return_value='Darwin'): + manager = AksMcpBinaryManager('/test/dir') + expected_path = os.path.join('/test/dir', 'aks-mcp') + self.assertEqual(manager.get_binary_path(), expected_path) + + def test_is_binary_available_not_exists(self): + """Test binary availability when file doesn't exist.""" + self.assertFalse(self.binary_manager.is_binary_available()) + + def test_is_binary_available_exists_but_not_executable(self): + """Test binary availability when file exists but is not executable.""" + # Create a non-executable file + with open(self.binary_manager.binary_path, 'w') as f: + f.write('dummy content') + os.chmod(self.binary_manager.binary_path, 0o644) # Read/write but not execute + + self.assertFalse(self.binary_manager.is_binary_available()) + + def test_is_binary_available_exists_and_executable(self): + """Test binary availability when file exists and is executable.""" + # Create an executable file + with open(self.binary_manager.binary_path, 'w') as f: + f.write('dummy content') + os.chmod(self.binary_manager.binary_path, 0o755) # Read/write/execute + + self.assertTrue(self.binary_manager.is_binary_available()) + + @patch('os.access') + def test_is_binary_available_os_error(self, mock_access): + """Test binary availability when os.access raises OSError.""" + # Create file first + with open(self.binary_manager.binary_path, 'w') as f: + f.write('dummy content') + + mock_access.side_effect = OSError("Permission denied") + self.assertFalse(self.binary_manager.is_binary_available()) + + def test_get_binary_version_not_available(self): + """Test version retrieval when binary is not available.""" + self.assertIsNone(self.binary_manager.get_binary_version()) + + @patch('subprocess.run') + def test_get_binary_version_success(self, mock_run): + """Test successful version retrieval.""" + # Create an executable file + with open(self.binary_manager.binary_path, 'w') as f: + f.write('#!/bin/bash\necho "aks-mcp version 0.1.0"') + os.chmod(self.binary_manager.binary_path, 0o755) + + # Mock subprocess.run + mock_result = Mock() + mock_result.returncode = 0 + mock_result.stdout = "aks-mcp version 0.1.0\n" + mock_run.return_value = mock_result + + version = self.binary_manager.get_binary_version() + self.assertEqual(version, "0.1.0") + mock_run.assert_called_once_with( + [self.binary_manager.binary_path, "--version"], + capture_output=True, + text=True, + timeout=10, + check=False + ) + + @patch('subprocess.run') + def test_get_binary_version_different_format(self, mock_run): + """Test version retrieval with different output format.""" + # Create an executable file + with open(self.binary_manager.binary_path, 'w') as f: + f.write('dummy content') + os.chmod(self.binary_manager.binary_path, 0o755) + + # Different format + mock_result = Mock() + mock_result.returncode = 0 + mock_result.stdout = "version: 1.2.3\n" + mock_run.return_value = mock_result + + version = self.binary_manager.get_binary_version() + self.assertEqual(version, "1.2.3") + + @patch('subprocess.run') + def test_get_binary_version_actual_format(self, mock_run): + """Test version retrieval with actual aks-mcp version format.""" + # Create an executable file + with open(self.binary_manager.binary_path, 'w') as f: + f.write('dummy content') + os.chmod(self.binary_manager.binary_path, 0o755) + + # Mock subprocess.run with actual aks-mcp output format + mock_result = Mock() + mock_result.returncode = 0 + mock_result.stdout = """aks-mcp version v0.0.6-16-ga7464eb+2025-08-18T13:33:38Z +Git commit: a7464eb458bcb138599519a0281b1047cc63b749 +Git tree state: clean +Go version: go1.24.5 +Platform: darwin/arm64 +""" + mock_run.return_value = mock_result + + version = self.binary_manager.get_binary_version() + self.assertEqual(version, "0.0.6") + + @patch('subprocess.run') + def test_get_binary_version_git_format_variations(self, mock_run): + """Test version retrieval with different git-style version formats.""" + # Create an executable file + with open(self.binary_manager.binary_path, 'w') as f: + f.write('dummy content') + os.chmod(self.binary_manager.binary_path, 0o755) + + test_cases = [ + ("aks-mcp version v0.1.0", "0.1.0"), + ("aks-mcp version v0.1.0-5-g123abc", "0.1.0"), + ("aks-mcp version v1.2.3-10-gabc123+2025-01-01T12:00:00Z", "1.2.3"), + ("version v0.0.7-dirty", "0.0.7"), + ] + + for output, expected_version in test_cases: + with self.subTest(output=output): + mock_result = Mock() + mock_result.returncode = 0 + mock_result.stdout = output + "\n" + mock_run.return_value = mock_result + + version = self.binary_manager.get_binary_version() + self.assertEqual(version, expected_version) + + @patch('subprocess.run') + def test_get_binary_version_subprocess_error(self, mock_run): + """Test version retrieval when subprocess fails.""" + # Create an executable file + with open(self.binary_manager.binary_path, 'w') as f: + f.write('dummy content') + os.chmod(self.binary_manager.binary_path, 0o755) + + mock_run.side_effect = subprocess.SubprocessError("Command failed") + version = self.binary_manager.get_binary_version() + self.assertIsNone(version) + + @patch('subprocess.run') + def test_get_binary_version_timeout(self, mock_run): + """Test version retrieval when subprocess times out.""" + # Create an executable file + with open(self.binary_manager.binary_path, 'w') as f: + f.write('dummy content') + os.chmod(self.binary_manager.binary_path, 0o755) + + mock_run.side_effect = subprocess.TimeoutExpired("cmd", 10) + version = self.binary_manager.get_binary_version() + self.assertIsNone(version) + + @patch('subprocess.run') + def test_get_binary_version_non_zero_exit(self, mock_run): + """Test version retrieval when command returns non-zero exit code.""" + # Create an executable file + with open(self.binary_manager.binary_path, 'w') as f: + f.write('dummy content') + os.chmod(self.binary_manager.binary_path, 0o755) + + mock_result = Mock() + mock_result.returncode = 1 + mock_result.stdout = "error" + mock_run.return_value = mock_result + + version = self.binary_manager.get_binary_version() + self.assertIsNone(version) + + @patch.object(AksMcpBinaryManager, 'get_binary_version') + def test_validate_version_success(self, mock_get_version): + """Test successful version validation.""" + mock_get_version.return_value = "0.1.0" + self.assertTrue(self.binary_manager.validate_version("0.0.6")) + self.assertTrue(self.binary_manager.validate_version("0.1.0")) + self.assertFalse(self.binary_manager.validate_version("0.2.0")) + + @patch.object(AksMcpBinaryManager, 'get_binary_version') + def test_validate_version_no_version(self, mock_get_version): + """Test version validation when no version available.""" + mock_get_version.return_value = None + self.assertFalse(self.binary_manager.validate_version("0.0.6")) + + @patch.object(AksMcpBinaryManager, 'get_binary_version') + def test_validate_version_invalid_format(self, mock_get_version): + """Test version validation with invalid version format.""" + mock_get_version.return_value = "invalid-version" + self.assertFalse(self.binary_manager.validate_version("0.0.6")) + + def test_validate_version_complex_versions(self): + """Test version validation with complex version numbers.""" + with patch.object(self.binary_manager, 'get_binary_version') as mock_get_version: + # Test 4-part version + mock_get_version.return_value = "0.1.0.1" + self.assertTrue(self.binary_manager.validate_version("0.1.0.0")) + self.assertFalse(self.binary_manager.validate_version("0.2.0.0")) + + # Test equal versions + mock_get_version.return_value = "1.2.3" + self.assertTrue(self.binary_manager.validate_version("1.2.3")) + + def test_get_platform_info_linux_amd64(self): + """Test platform info detection for Linux amd64.""" + with patch('platform.system', return_value='Linux'), \ + patch('platform.machine', return_value='x86_64'): + platform_name, arch_name = self.binary_manager._get_platform_info() + self.assertEqual(platform_name, 'linux') + self.assertEqual(arch_name, 'amd64') + + def test_get_platform_info_darwin_arm64(self): + """Test platform info detection for macOS ARM64.""" + with patch('platform.system', return_value='Darwin'), \ + patch('platform.machine', return_value='arm64'): + platform_name, arch_name = self.binary_manager._get_platform_info() + self.assertEqual(platform_name, 'darwin') + self.assertEqual(arch_name, 'arm64') + + def test_get_platform_info_windows_amd64(self): + """Test platform info detection for Windows amd64.""" + with patch('platform.system', return_value='Windows'), \ + patch('platform.machine', return_value='AMD64'): + platform_name, arch_name = self.binary_manager._get_platform_info() + self.assertEqual(platform_name, 'windows') + self.assertEqual(arch_name, 'amd64') + + def test_get_platform_info_linux_aarch64(self): + """Test platform info detection for Linux aarch64.""" + with patch('platform.system', return_value='Linux'), \ + patch('platform.machine', return_value='aarch64'): + platform_name, arch_name = self.binary_manager._get_platform_info() + self.assertEqual(platform_name, 'linux') + self.assertEqual(arch_name, 'arm64') + + def test_make_binary_executable_unix(self): + """Test making binary executable on Unix-like systems.""" + if platform.system() == 'Windows': + self.skipTest("Skipping Unix test on Windows") + + # Create a test file + test_file = os.path.join(self.test_install_dir, 'test-binary') + with open(test_file, 'w') as f: + f.write('dummy content') + os.chmod(test_file, 0o644) # Read/write only + + # Make it executable + success = self.binary_manager._make_binary_executable(test_file) + self.assertTrue(success) + + # Check that it's now executable + file_mode = os.stat(test_file).st_mode + self.assertTrue(file_mode & stat.S_IEXEC) # Owner executable + self.assertTrue(file_mode & stat.S_IXGRP) # Group executable + self.assertTrue(file_mode & stat.S_IXOTH) # Others executable + + @patch('platform.system', return_value='Windows') + def test_make_binary_executable_windows(self, mock_system): + """Test making binary executable on Windows (should always succeed).""" + test_file = os.path.join(self.test_install_dir, 'test-binary.exe') + with open(test_file, 'w') as f: + f.write('dummy content') + + success = self.binary_manager._make_binary_executable(test_file) + self.assertTrue(success) + + def test_make_binary_executable_os_error(self): + """Test making binary executable when OS operations fail.""" + if platform.system() == 'Windows': + self.skipTest("Skipping Unix test on Windows") + + # Try to make a non-existent file executable + non_existent_file = os.path.join(self.test_install_dir, 'non-existent') + success = self.binary_manager._make_binary_executable(non_existent_file) + self.assertFalse(success) + + # New tests for GitHub Release API Integration + + def test_get_platform_binary_name_linux_amd64(self): + """Test platform binary name generation for Linux AMD64.""" + with patch.object(self.binary_manager, '_get_platform_info', return_value=('linux', 'amd64')): + binary_name = self.binary_manager._get_platform_binary_name() + self.assertEqual(binary_name, 'aks-mcp-linux-amd64') + + def test_get_platform_binary_name_windows_amd64(self): + """Test platform binary name generation for Windows AMD64.""" + with patch.object(self.binary_manager, '_get_platform_info', return_value=('windows', 'amd64')), \ + patch('platform.system', return_value='Windows'): + binary_name = self.binary_manager._get_platform_binary_name() + self.assertEqual(binary_name, 'aks-mcp-windows-amd64.exe') + + def test_get_platform_binary_name_darwin_arm64(self): + """Test platform binary name generation for macOS ARM64.""" + with patch.object(self.binary_manager, '_get_platform_info', return_value=('darwin', 'arm64')): + binary_name = self.binary_manager._get_platform_binary_name() + self.assertEqual(binary_name, 'aks-mcp-darwin-arm64') + + @patch('aiohttp.ClientSession') + async def test_get_latest_release_info_success(self, mock_session): + """Test successful GitHub API release info retrieval.""" + # Mock successful API response + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.json = AsyncMock(return_value={ + "tag_name": "v0.1.0", + "assets": [ + {"name": "aks-mcp-linux-amd64", "browser_download_url": "https://example.com/binary"} + ] + }) + + # Create mock session with proper async context manager support + mock_session_ctx = AsyncMock() + mock_session_ctx.__aenter__ = AsyncMock(return_value=mock_session_ctx) + mock_session_ctx.__aexit__ = AsyncMock(return_value=None) + + # Create mock get response with proper async context manager support + mock_get_ctx = AsyncMock() + mock_get_ctx.__aenter__ = AsyncMock(return_value=mock_response) + mock_get_ctx.__aexit__ = AsyncMock(return_value=None) + + # Wire up the mocks + mock_session.return_value = mock_session_ctx + mock_session_ctx.get = Mock(return_value=mock_get_ctx) + + result = await self.binary_manager.get_latest_release_info() + + self.assertIsInstance(result, dict) + self.assertEqual(result["tag_name"], "v0.1.0") + self.assertIn("assets", result) + + @patch('aiohttp.ClientSession') + async def test_get_latest_release_info_http_error(self, mock_session): + """Test GitHub API release info with HTTP error.""" + # Mock HTTP error response + mock_response = AsyncMock() + mock_response.status = 404 + + # Create mock session with proper async context manager support + mock_session_ctx = AsyncMock() + mock_session_ctx.__aenter__ = AsyncMock(return_value=mock_session_ctx) + mock_session_ctx.__aexit__ = AsyncMock(return_value=None) + + # Create mock get response with proper async context manager support + mock_get_ctx = AsyncMock() + mock_get_ctx.__aenter__ = AsyncMock(return_value=mock_response) + mock_get_ctx.__aexit__ = AsyncMock(return_value=None) + + # Wire up the mocks + mock_session.return_value = mock_session_ctx + mock_session_ctx.get = Mock(return_value=mock_get_ctx) + + with self.assertRaises(Exception) as context: + await self.binary_manager.get_latest_release_info() + + self.assertIn("GitHub API request failed with status 404", str(context.exception)) + + @patch('aiohttp.ClientSession') + async def test_get_latest_release_info_network_error(self, mock_session): + """Test GitHub API release info with network error.""" + # Create mock session that raises ClientError + mock_session_ctx = AsyncMock() + mock_session_ctx.__aenter__ = AsyncMock(return_value=mock_session_ctx) + mock_session_ctx.__aexit__ = AsyncMock(return_value=None) + + # Mock get to raise aiohttp.ClientError + import aiohttp + mock_session_ctx.get = Mock(side_effect=aiohttp.ClientError("Network unreachable")) + + # Wire up the mocks + mock_session.return_value = mock_session_ctx + + with self.assertRaises(Exception) as context: + await self.binary_manager.get_latest_release_info() + + self.assertIn("Network error accessing GitHub API", str(context.exception)) + + @patch('aiohttp.ClientSession') + async def test_get_latest_release_info_json_error(self, mock_session): + """Test GitHub API release info with JSON decode error.""" + # Mock response with invalid JSON + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.json = AsyncMock(side_effect=ValueError("Invalid JSON")) + + # Create mock session with proper async context manager support + mock_session_ctx = AsyncMock() + mock_session_ctx.__aenter__ = AsyncMock(return_value=mock_session_ctx) + mock_session_ctx.__aexit__ = AsyncMock(return_value=None) + + # Create mock get response with proper async context manager support + mock_get_ctx = AsyncMock() + mock_get_ctx.__aenter__ = AsyncMock(return_value=mock_response) + mock_get_ctx.__aexit__ = AsyncMock(return_value=None) + + # Wire up the mocks + mock_session.return_value = mock_session_ctx + mock_session_ctx.get = Mock(return_value=mock_get_ctx) + + with self.assertRaises(Exception): + await self.binary_manager.get_latest_release_info() + + @patch('urllib.request.urlopen') + def test_verify_binary_integrity_subject_hash(self, mock_urlopen): + """Test binary integrity verification with subject hash in attestation.""" + import json + import hashlib + + # Create a test binary file (any size) + test_file = os.path.join(self.test_install_dir, 'aks-mcp-darwin-arm64') + test_content = b'test content' + with open(test_file, 'wb') as f: + f.write(test_content) + + # Calculate the actual hash of our test content + actual_hash = hashlib.sha256(test_content).hexdigest() + + # Mock release info with attestation file + release_info = { + "assets": [ + {"name": "aks-mcp-darwin-arm64.intoto.jsonl", "browser_download_url": "https://example.com/attestation"} + ] + } + + # Mock attestation content with subject hash + attestation_dict = { + "subject": [ + { + "name": "aks-mcp-darwin-arm64", + "digest": { + "sha256": actual_hash + } + } + ] + } + attestation_content = json.dumps(attestation_dict) + + # Mock HTTP response + mock_response = Mock() + mock_response.status = 200 + mock_response.read.return_value = attestation_content.encode('utf-8') + mock_urlopen.return_value.__enter__.return_value = mock_response + + result = self.binary_manager._verify_binary_integrity(test_file, release_info) + self.assertTrue(result) + + def test_verify_binary_integrity_fallback_to_basic(self): + """Test binary integrity verification fallback when no attestation found.""" + # Create a test binary file (any size) + test_file = os.path.join(self.test_install_dir, 'aks-mcp-linux-amd64') + with open(test_file, 'wb') as f: + f.write(b'test content') + + release_info = {"assets": []} + result = self.binary_manager._verify_binary_integrity(test_file, release_info) + self.assertTrue(result) + + async def test_ensure_binary_already_available_and_valid(self): + """Test ensure_binary when binary is already available and valid.""" + with patch.object(self.binary_manager, 'is_binary_available', return_value=True), \ + patch.object(self.binary_manager, 'get_binary_version', return_value="1.0.0"), \ + patch.object(self.binary_manager, 'validate_version', return_value=True): + + status = await self.binary_manager.ensure_binary() + + self.assertTrue(status.available) + self.assertEqual(status.version, "1.0.0") + self.assertTrue(status.version_valid) + self.assertTrue(status.ready) + self.assertIsNone(status.error_message) + + async def test_ensure_binary_available_but_invalid_version(self): + """Test ensure_binary when binary is available but has invalid version.""" + mock_progress = Mock() + + with patch.object(self.binary_manager, 'is_binary_available', side_effect=[True, True]), \ + patch.object(self.binary_manager, 'get_binary_version', side_effect=["0.0.1", "1.0.0"]), \ + patch.object(self.binary_manager, 'validate_version', side_effect=[False, True]), \ + patch.object(self.binary_manager, '_create_installation_directory', return_value=True), \ + patch.object(self.binary_manager, 'download_binary', return_value=True): + + status = await self.binary_manager.ensure_binary(progress_callback=mock_progress) + + self.assertTrue(status.available) + self.assertEqual(status.version, "1.0.0") + self.assertTrue(status.version_valid) + self.assertTrue(status.ready) + self.assertIsNone(status.error_message) + + async def test_ensure_binary_not_available_download_success(self): + """Test ensure_binary when binary is not available but download succeeds.""" + mock_progress = Mock() + + with patch.object(self.binary_manager, 'is_binary_available', side_effect=[False, True]), \ + patch.object(self.binary_manager, 'get_binary_version', return_value="1.0.0"), \ + patch.object(self.binary_manager, 'validate_version', return_value=True), \ + patch.object(self.binary_manager, '_create_installation_directory', return_value=True), \ + patch.object(self.binary_manager, 'download_binary', return_value=True) as mock_download: + + status = await self.binary_manager.ensure_binary(progress_callback=mock_progress) + + self.assertTrue(status.available) + self.assertEqual(status.version, "1.0.0") + self.assertTrue(status.version_valid) + self.assertTrue(status.ready) + self.assertIsNone(status.error_message) + mock_download.assert_called_once_with(progress_callback=mock_progress) + + async def test_ensure_binary_directory_creation_fails(self): + """Test ensure_binary when directory creation fails.""" + with patch.object(self.binary_manager, 'is_binary_available', return_value=False), \ + patch.object(self.binary_manager, '_create_installation_directory', return_value=False): + + status = await self.binary_manager.ensure_binary() + + self.assertFalse(status.ready) + self.assertIn("Failed to create installation directory", status.error_message) + + async def test_ensure_binary_download_fails(self): + """Test ensure_binary when download fails.""" + with patch.object(self.binary_manager, 'is_binary_available', return_value=False), \ + patch.object(self.binary_manager, '_create_installation_directory', return_value=True), \ + patch.object(self.binary_manager, 'download_binary', return_value=False): + + status = await self.binary_manager.ensure_binary() + + self.assertFalse(status.ready) + self.assertEqual(status.error_message, "Binary download failed") + + async def test_ensure_binary_download_success_but_validation_fails(self): + """Test ensure_binary when download succeeds but validation fails.""" + with patch.object(self.binary_manager, 'is_binary_available', side_effect=[False, True]), \ + patch.object(self.binary_manager, 'get_binary_version', return_value="0.0.1"), \ + patch.object(self.binary_manager, 'validate_version', return_value=False), \ + patch.object(self.binary_manager, '_create_installation_directory', return_value=True), \ + patch.object(self.binary_manager, 'download_binary', return_value=True): + + status = await self.binary_manager.ensure_binary() + + self.assertTrue(status.available) + self.assertEqual(status.version, "0.0.1") + self.assertFalse(status.version_valid) + self.assertFalse(status.ready) + self.assertEqual(status.error_message, "Downloaded binary failed validation") + + async def test_ensure_binary_unexpected_exception(self): + """Test ensure_binary handles unexpected exceptions gracefully.""" + with patch.object(self.binary_manager, 'is_binary_available', side_effect=Exception("Unexpected error")): + + status = await self.binary_manager.ensure_binary() + + self.assertFalse(status.ready) + self.assertIn("Unexpected error during binary management", status.error_message) + self.assertIn("Unexpected error", status.error_message) + + +if __name__ == '__main__': + unittest.main() + diff --git a/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_config_generator.py b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_config_generator.py new file mode 100644 index 00000000000..b8f0eb7040f --- /dev/null +++ b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_config_generator.py @@ -0,0 +1,300 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- + +import unittest + +from azext_aks_agent.agent.config_generator import ConfigurationGenerator + + +class TestConfigurationGenerator(unittest.TestCase): + """Test cases for ConfigurationGenerator class.""" + + def setUp(self): + """Set up test fixtures.""" + self.base_config = { + "model": "gpt-4", + "api_key": "test-key", + "max_steps": 10, + "toolsets": { + "some-custom-toolset": {"enabled": True} + }, + "other_setting": "value" + } + + self.server_url = "http://localhost:8003/sse" + + def test_generate_mcp_config_basic(self): + """Test basic MCP configuration generation.""" + result = ConfigurationGenerator.generate_mcp_config(self.base_config, self.server_url) + + # Should preserve base config + self.assertEqual(result["model"], "gpt-4") + self.assertEqual(result["api_key"], "test-key") + self.assertEqual(result["max_steps"], 10) + self.assertEqual(result["other_setting"], "value") + + # Should add MCP server configuration + self.assertIn("mcp_servers", result) + self.assertIn("aks-mcp", result["mcp_servers"]) + + aks_mcp = result["mcp_servers"]["aks-mcp"] + self.assertEqual(aks_mcp["description"], "AKS MCP server") + self.assertEqual(aks_mcp["url"], self.server_url) + + # Should disable conflicting toolsets + toolsets = result["toolsets"] + self.assertFalse(toolsets["aks/node-health"]["enabled"]) + self.assertFalse(toolsets["aks/core"]["enabled"]) + self.assertFalse(toolsets["kubernetes/core"]["enabled"]) + self.assertFalse(toolsets["kubernetes/logs"]["enabled"]) + self.assertFalse(toolsets["kubernetes/live-metrics"]["enabled"]) + self.assertFalse(toolsets["bash"]["enabled"]) + + # Should preserve custom toolsets + self.assertTrue(toolsets["some-custom-toolset"]["enabled"]) + + def test_generate_mcp_config_empty_base(self): + """Test MCP configuration generation with empty base config.""" + result = ConfigurationGenerator.generate_mcp_config({}, self.server_url) + + # Should create MCP server configuration + self.assertIn("mcp_servers", result) + self.assertIn("aks-mcp", result["mcp_servers"]) + + # Should create toolsets configuration + self.assertIn("toolsets", result) + for toolset_name in ConfigurationGenerator.DEFAULT_CONFLICTING_TOOLSETS: + self.assertIn(toolset_name, result["toolsets"]) + self.assertFalse(result["toolsets"][toolset_name]["enabled"]) + + def test_generate_mcp_config_none_base(self): + """Test MCP configuration generation with None base config.""" + result = ConfigurationGenerator.generate_mcp_config(None, self.server_url) + + # Should handle None gracefully + self.assertIn("mcp_servers", result) + self.assertIn("toolsets", result) + + def test_generate_mcp_config_invalid_server_url(self): + """Test MCP configuration generation with invalid server URL.""" + with self.assertRaises(ValueError): + ConfigurationGenerator.generate_mcp_config(self.base_config, "") + + with self.assertRaises(ValueError): + ConfigurationGenerator.generate_mcp_config(self.base_config, None) + + def test_generate_mcp_config_preserves_original(self): + """Test that MCP config generation doesn't modify original config.""" + original_toolsets_count = len(self.base_config["toolsets"]) + + result = ConfigurationGenerator.generate_mcp_config(self.base_config, self.server_url) + + # Original should be unchanged + self.assertEqual(len(self.base_config["toolsets"]), original_toolsets_count) + self.assertNotIn("mcp_servers", self.base_config) + + # Result should be different + self.assertGreater(len(result["toolsets"]), original_toolsets_count) + self.assertIn("mcp_servers", result) + + def test_generate_traditional_config_basic(self): + """Test basic traditional configuration generation.""" + result = ConfigurationGenerator.generate_traditional_config(self.base_config) + + # Should preserve base config + self.assertEqual(result["model"], "gpt-4") + self.assertEqual(result["api_key"], "test-key") + self.assertEqual(result["max_steps"], 10) + self.assertEqual(result["other_setting"], "value") + + # Should not have MCP servers + self.assertNotIn("mcp_servers", result) + + # Should enable all default toolsets + toolsets = result["toolsets"] + for toolset_name, toolset_cfg in ConfigurationGenerator.DEFAULT_CONFLICTING_TOOLSETS.items(): + self.assertTrue(toolset_name in toolsets) + self.assertTrue(toolsets[toolset_name]["enabled"]) # enabled True + + def test_generate_traditional_config_removes_mcp(self): + """Traditional config should remove any mcp_servers section.""" + base_with_mcp = dict(self.base_config) + base_with_mcp["mcp_servers"] = {"some-server": {"url": "test"}} + + result = ConfigurationGenerator.generate_traditional_config(base_with_mcp) + + # Should remove MCP servers + self.assertNotIn("mcp_servers", result) + + def test_generate_traditional_config_empty_base(self): + """Test traditional configuration generation with empty base config.""" + result = ConfigurationGenerator.generate_traditional_config({}) + + # Should create toolsets configuration + self.assertIn("toolsets", result) + for toolset_name in ConfigurationGenerator.DEFAULT_CONFLICTING_TOOLSETS: + self.assertTrue(result["toolsets"][toolset_name]["enabled"]) + + def test_generate_traditional_config_none_base(self): + """Test traditional configuration generation with None base config.""" + result = ConfigurationGenerator.generate_traditional_config(None) + + # Should handle None gracefully + self.assertIn("toolsets", result) + + def test_merge_configs_basic(self): + """Test basic configuration merging.""" + base = {"a": 1, "b": {"x": 10, "y": 20}, "c": 3} + override = {"b": {"y": 25, "z": 30}, "d": 4} + + result = ConfigurationGenerator.merge_configs(base, override) + + # Should merge properly + self.assertEqual(result["a"], 1) # From base + self.assertEqual(result["c"], 3) # From base + self.assertEqual(result["d"], 4) # From override + + # Nested merge + self.assertEqual(result["b"]["x"], 10) # From base + self.assertEqual(result["b"]["y"], 25) # From override + self.assertEqual(result["b"]["z"], 30) # From override + + def test_merge_configs_empty_inputs(self): + """Test configuration merging with empty inputs.""" + base = {"a": 1} + + # Empty override + result = ConfigurationGenerator.merge_configs(base, {}) + self.assertEqual(result, base) + + # Empty base + result = ConfigurationGenerator.merge_configs({}, base) + self.assertEqual(result, base) + + # Both empty + result = ConfigurationGenerator.merge_configs({}, {}) + self.assertEqual(result, {}) + + # None inputs + result = ConfigurationGenerator.merge_configs(None, base) + self.assertEqual(result, base) + + result = ConfigurationGenerator.merge_configs(base, None) + self.assertEqual(result, base) + + result = ConfigurationGenerator.merge_configs(None, None) + self.assertEqual(result, {}) + + def test_merge_configs_preserves_originals(self): + """Test that merging doesn't modify original configs.""" + base = {"a": 1, "b": {"x": 10}} + override = {"b": {"y": 20}} + + original_base = dict(base) + original_override = dict(override) + + result = ConfigurationGenerator.merge_configs(base, override) + + # Originals should be unchanged + self.assertEqual(base, original_base) + self.assertEqual(override, original_override) + + # Result should be different + self.assertNotEqual(result, base) + self.assertNotEqual(result, override) + + def test_validate_mcp_config_valid(self): + """Test validation of valid MCP configuration.""" + config = ConfigurationGenerator.generate_mcp_config(self.base_config, self.server_url) + self.assertTrue(ConfigurationGenerator.validate_mcp_config(config)) + + def test_validate_mcp_config_invalid_structure(self): + """Test validation of invalid MCP configuration structures.""" + # Not a dictionary + self.assertFalse(ConfigurationGenerator.validate_mcp_config("invalid")) + self.assertFalse(ConfigurationGenerator.validate_mcp_config(None)) + + # Missing mcp_servers + config = {"toolsets": {}} + self.assertFalse(ConfigurationGenerator.validate_mcp_config(config)) + + # Invalid mcp_servers structure + config = {"mcp_servers": "invalid"} + self.assertFalse(ConfigurationGenerator.validate_mcp_config(config)) + + # Missing aks-mcp server + config = {"mcp_servers": {"other-server": {}}} + self.assertFalse(ConfigurationGenerator.validate_mcp_config(config)) + + def test_validate_mcp_config_missing_required_fields(self): + """Test validation with missing required MCP server fields.""" + base_mcp_server = { + "description": "AKS MCP server", + "url": "http://localhost:8003/sse" + } + + # Test each required field + required_fields = ["description", "url"] + for field in required_fields: + incomplete_server = dict(base_mcp_server) + del incomplete_server[field] + + config = { + "mcp_servers": {"aks-mcp": incomplete_server}, + "toolsets": {name: {"enabled": False} for name in ConfigurationGenerator.DEFAULT_CONFLICTING_TOOLSETS} + } + + self.assertFalse(ConfigurationGenerator.validate_mcp_config(config), + f"Should fail validation when missing {field}") + + def test_validate_mcp_config_enabled_conflicting_toolsets(self): + """Test validation fails with enabled conflicting toolsets.""" + config = { + "mcp_servers": { + "aks-mcp": { + "description": "AKS MCP server", + "url": "http://localhost:8003/sse" + } + }, + "toolsets": { + "aks/core": {"enabled": True} + } + } + + self.assertFalse(ConfigurationGenerator.validate_mcp_config(config)) + + def test_validate_traditional_config_valid(self): + """Test validation of valid traditional configuration.""" + config = ConfigurationGenerator.generate_traditional_config(self.base_config) + self.assertTrue(ConfigurationGenerator.validate_traditional_config(config)) + + def test_validate_traditional_config_invalid_structure(self): + """Test validation of invalid traditional configuration structures.""" + # Not a dictionary + self.assertFalse(ConfigurationGenerator.validate_traditional_config("invalid")) + self.assertFalse(ConfigurationGenerator.validate_traditional_config(None)) + + def test_validate_traditional_config_with_mcp_servers(self): + """Test validation fails for traditional config with MCP servers.""" + config = { + "mcp_servers": {"aks-mcp": {"url": "test"}}, + "toolsets": {name: {"enabled": True} for name in ConfigurationGenerator.DEFAULT_CONFLICTING_TOOLSETS} + } + + self.assertFalse(ConfigurationGenerator.validate_traditional_config(config)) + + def test_validate_traditional_config_disabled_toolsets(self): + """Test validation fails for traditional config with disabled required toolsets.""" + config = { + "toolsets": { + "aks/core": {"enabled": False} + } + } + + self.assertFalse(ConfigurationGenerator.validate_traditional_config(config)) + + +if __name__ == '__main__': + unittest.main() diff --git a/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_error_handler.py b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_error_handler.py new file mode 100644 index 00000000000..9132a931eb2 --- /dev/null +++ b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_error_handler.py @@ -0,0 +1,224 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- + +""" +Tests for AKS Agent Error Handler functionality. +""" + +from knack.util import CLIError + + +class TestAksAgentErrorHandler: + """Test AKS Agent Error Handler functionality.""" + + def test_agent_error_base_class(self): + """Test AgentError base class functionality.""" + from azext_aks_agent.agent.error_handler import AgentError + + # Test basic error creation + error = AgentError("Test error", "TEST_CODE", ["Suggestion 1", "Suggestion 2"]) + + assert str(error) == "Test error" + assert error.message == "Test error" + assert error.error_code == "TEST_CODE" + assert error.suggestions == ["Suggestion 1", "Suggestion 2"] + + # Test defaults + error_minimal = AgentError("Minimal error") + assert error_minimal.error_code == "GENERAL" + assert error_minimal.suggestions == [] + + def test_mcp_error_inheritance(self): + """Test MCPError extends AgentError with default suggestions.""" + from azext_aks_agent.agent.error_handler import MCPError + + error = MCPError("MCP failed") + + assert error.error_code == "MCP" + assert len(error.suggestions) > 0 + assert any("--no-aks-mcp" in suggestion for suggestion in error.suggestions) + assert any("internet connection" in suggestion.lower() for suggestion in error.suggestions) + + def test_binary_error_inheritance(self): + """Test BinaryError extends MCPError with specific suggestions.""" + from azext_aks_agent.agent.error_handler import BinaryError + + error = BinaryError("Binary download failed", "BINARY_TEST", ["Custom suggestion"]) + + assert error.error_code == "BINARY_TEST" + # Should have both custom and default suggestions + assert "Custom suggestion" in error.suggestions + assert any("internet connectivity" in suggestion.lower() for suggestion in error.suggestions) + + def test_server_error_inheritance(self): + """Test ServerError extends MCPError with server-specific suggestions.""" + from azext_aks_agent.agent.error_handler import ServerError + + error = ServerError("Server startup failed") + + assert error.error_code == "SERVER" + assert any("port" in suggestion.lower() for suggestion in error.suggestions) + assert any("execute permissions" in suggestion.lower() for suggestion in error.suggestions) + + def test_configuration_error_inheritance(self): + """Test ConfigurationError extends AgentError with config-specific suggestions.""" + from azext_aks_agent.agent.error_handler import ConfigurationError + + error = ConfigurationError("Invalid config") + + assert error.error_code == "CONFIG" + assert any("YAML" in suggestion for suggestion in error.suggestions) + assert any("configuration" in suggestion.lower() for suggestion in error.suggestions) + + def test_error_message_formatting(self): + """Test error message formatting functionality.""" + from azext_aks_agent.agent.error_handler import AgentErrorHandler, MCPError + + # Test with AgentError + error = MCPError("Test MCP error", "TEST_MCP", ["Suggestion 1", "Suggestion 2"]) + formatted = AgentErrorHandler.format_error_message(error, show_suggestions=True) + + assert "AKS Agent Error (TEST_MCP): Test MCP error" in formatted + assert "Suggestions:" in formatted + assert "1. Suggestion 1" in formatted + assert "2. Suggestion 2" in formatted + + # Test without suggestions + formatted_no_suggestions = AgentErrorHandler.format_error_message(error, show_suggestions=False) + assert "AKS Agent Error (TEST_MCP): Test MCP error" in formatted_no_suggestions + assert "Suggestions:" not in formatted_no_suggestions + + # Test with non-AgentError + regular_error = Exception("Regular error") + formatted_regular = AgentErrorHandler.format_error_message(regular_error) + assert "AKS Agent Error: Regular error" in formatted_regular + + def test_cli_error_creation(self): + """Test CLIError creation from AgentError.""" + from azext_aks_agent.agent.error_handler import AgentErrorHandler, MCPError + + agent_error = MCPError("Test error", "TEST", ["Test suggestion"]) + cli_error = AgentErrorHandler.create_cli_error(agent_error) + + assert isinstance(cli_error, CLIError) + assert "AKS Agent Error (TEST): Test error" in str(cli_error) + assert "Test suggestion" in str(cli_error) + + def test_mcp_setup_error_handling(self): + """Test MCP setup error handling with context-specific guidance.""" + from azext_aks_agent.agent.error_handler import AgentErrorHandler, MCPError, BinaryError, ServerError + + # Simulate network error + original_error = ConnectionError("Network failure") + mcp_error = AgentErrorHandler.handle_mcp_setup_error(original_error, "initialization") + + assert isinstance(mcp_error, MCPError) + assert "MCP setup failed during initialization" in str(mcp_error) + assert any("internet" in s.lower() for s in mcp_error.suggestions) + + # Binary error cases + download_error = Exception("Download failed") + binary_error = AgentErrorHandler.handle_binary_error(download_error, "download") + + assert isinstance(binary_error, BinaryError) + assert "Binary download failed" in str(binary_error) + assert binary_error.error_code == "BINARY_DOWNLOAD" + assert any("internet connectivity" in suggestion.lower() for suggestion in binary_error.suggestions) + + # Validation + validation_error = Exception("Checksum mismatch") + binary_error_val = AgentErrorHandler.handle_binary_error(validation_error, "validation") + + assert binary_error_val.error_code == "BINARY_VALIDATION" + assert any("corrupted" in suggestion.lower() for suggestion in binary_error_val.suggestions) + + # Execution + execution_error = Exception("Permission denied") + binary_error_exec = AgentErrorHandler.handle_binary_error(execution_error, "execution") + + assert binary_error_exec.error_code == "BINARY_EXECUTION" + assert any("execute permissions" in suggestion.lower() for suggestion in binary_error_exec.suggestions) + + def test_server_error_handling(self): + """Test server error handling with operation-specific guidance.""" + from azext_aks_agent.agent.error_handler import AgentErrorHandler, ServerError + + # Startup + startup_error = Exception("Startup failed") + server_error = AgentErrorHandler.handle_server_error(startup_error, "startup") + + assert isinstance(server_error, ServerError) + assert "MCP server startup failed" in str(server_error) + assert server_error.error_code == "SERVER_STARTUP" + assert any("binary is available" in suggestion.lower() for suggestion in server_error.suggestions) + + # Health check + health_error = Exception("Health check timeout") + server_error_health = AgentErrorHandler.handle_server_error(health_error, "health_check") + + assert server_error_health.error_code == "SERVER_HEALTH_CHECK" + assert any("automatically restarted" in suggestion.lower() for suggestion in server_error_health.suggestions) + + # Communication + comm_error = Exception("Connection refused") + server_error_comm = AgentErrorHandler.handle_server_error(comm_error, "communication") + + assert server_error_comm.error_code == "SERVER_COMMUNICATION" + assert any("still running" in suggestion.lower() for suggestion in server_error_comm.suggestions) + + def test_context_error_creation(self): + """Test AKS context validation error creation.""" + from azext_aks_agent.agent.error_handler import AgentErrorHandler, AgentError + + context_info = { + "cluster_name": "test-cluster", + "resource_group": "test-rg", + "subscription_id": "test-sub-id" + } + + context_error = AgentErrorHandler.create_context_error(context_info) + + assert isinstance(context_error, AgentError) + assert context_error.error_code == "CONTEXT_VALIDATION" + assert "test-cluster" in str(context_error) + assert "test-rg" in str(context_error) + assert "test-sub-id" in str(context_error) + assert any("--name " in suggestion for suggestion in context_error.suggestions) + assert any("az login" in suggestion for suggestion in context_error.suggestions) + + # Test with None values + context_info_none = { + "cluster_name": None, + "resource_group": None, + "subscription_id": None + } + + context_error_none = AgentErrorHandler.create_context_error(context_info_none) + assert "None" in str(context_error_none) + + def test_error_codes_uniqueness(self): + """Test that error codes are unique and descriptive.""" + from azext_aks_agent.agent.error_handler import ( + MCPError, BinaryError, ServerError, ConfigurationError, AgentErrorHandler + ) + + # Different error types + mcp_error = MCPError("Test") + binary_error = BinaryError("Test") + server_error = ServerError("Test") + config_error = ConfigurationError("Test") + + error_codes = {mcp_error.error_code, binary_error.error_code, + server_error.error_code, config_error.error_code} + + assert len(error_codes) >= 3 + + # Handler-generated errors have specific codes + binary_download = AgentErrorHandler.handle_binary_error(Exception(), "download") + binary_validation = AgentErrorHandler.handle_binary_error(Exception(), "validation") + + assert binary_download.error_code == "BINARY_DOWNLOAD" + assert binary_validation.error_code == "BINARY_VALIDATION" + diff --git a/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_mcp_integration.py b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_mcp_integration.py new file mode 100644 index 00000000000..4ac429949b1 --- /dev/null +++ b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_mcp_integration.py @@ -0,0 +1,291 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- + +""" +Tests for AKS Agent MCP integration functionality. +""" + +import asyncio +from unittest.mock import AsyncMock, Mock, patch +import pytest + + +class TestAksAgentMCPIntegration: + """Test AKS Agent MCP integration functionality.""" + + def setup_method(self): + """Set up test fixtures.""" + self.mock_cmd = Mock() + self.mock_cmd.cli_ctx = Mock() + self.test_config_file = "~/.azure/aksAgent.config" + self.test_model = "gpt-4" + self.test_api_key = "test-key" + self.test_max_steps = 10 + + def test_initialize_mcp_manager_success(self): + """Test successful MCP manager initialization.""" + from azext_aks_agent.agent.agent import _initialize_mcp_manager + + with patch('azext_aks_agent.agent.mcp_manager.MCPManager') as mock_mcp_class: + mock_manager = Mock() + mock_mcp_class.return_value = mock_manager + + result = _initialize_mcp_manager(verbose=True) + + assert result == mock_manager + mock_mcp_class.assert_called_once_with(verbose=True) + + def test_initialize_mcp_manager_import_error(self): + """Test MCP manager initialization with import error.""" + from azext_aks_agent.agent.agent import _initialize_mcp_manager + from azext_aks_agent.agent.error_handler import MCPError + + with patch('azext_aks_agent.agent.mcp_manager.MCPManager', side_effect=ImportError("Module not found")): + with pytest.raises(MCPError) as exc_info: + _initialize_mcp_manager() + + assert "MCP manager initialization failed" in str(exc_info.value) + assert exc_info.value.error_code == "MCP_IMPORT" + assert "Ensure all required dependencies are installed" in exc_info.value.suggestions[0] + + @pytest.mark.asyncio + async def test_setup_mcp_mode_basic_workflow(self): + """Test basic MCP mode setup workflow without complex mocking.""" + from azext_aks_agent.agent.agent import _setup_mcp_mode + from azext_aks_agent.agent.binary_manager import BinaryStatus + + # Create a simple mock manager + mock_manager = Mock() + mock_manager.is_binary_available.return_value = True + mock_manager.validate_binary_version.return_value = True + mock_manager.start_server = AsyncMock(return_value=True) + mock_manager.get_server_url.return_value = "http://localhost:8003/sse" + + # Mock binary status + mock_binary_status = BinaryStatus(available=True, version_valid=True) + mock_manager.binary_manager.ensure_binary = AsyncMock(return_value=mock_binary_status) + + # Test with a non-existent config file (will use empty config) + with patch('pathlib.Path.exists', return_value=False), \ + patch('tempfile.NamedTemporaryFile') as mock_temp_file, \ + patch('yaml.dump') as mock_yaml_dump, \ + patch('os.unlink'): + + # Mock the temporary file context manager + mock_temp_file.return_value.__enter__.return_value.name = "/tmp/test_config.yaml" + + # This should fail because we haven't mocked Holmes Config.load_from_file, + # but that's expected - we're just testing the workflow doesn't crash + try: + await _setup_mcp_mode( + mock_manager, "nonexistent_config.yaml", "gpt-4", + "test-key", 10, verbose=False + ) + except Exception as e: + # Expected to fail at Config.load_from_file + assert "Config" in str(e) or "load_from_file" in str(e) or "ImportError" in str(e) + + # Verify the manager methods were called correctly + mock_manager.start_server.assert_called_once() + assert mock_manager.get_server_url.called + + # Check the content of the configuration that was passed to yaml.dump + if mock_yaml_dump.call_count > 0: + config_data = mock_yaml_dump.call_args[0][0] + + # Verify MCP server configuration is present + assert "mcp_servers" in config_data + assert "aks-mcp" in config_data["mcp_servers"] + assert config_data["mcp_servers"]["aks-mcp"]["url"] == "http://localhost:8003/sse" + + # Verify conflicting toolsets are disabled + assert "toolsets" in config_data + toolsets = config_data["toolsets"] + assert toolsets["aks/core"]["enabled"] is False + assert toolsets["kubernetes/core"]["enabled"] is False + + @pytest.mark.asyncio + async def test_setup_mcp_mode_binary_not_available(self): + """Test MCP mode setup when binary is not available and download fails.""" + from azext_aks_agent.agent.agent import _setup_mcp_mode + from azext_aks_agent.agent.binary_manager import BinaryStatus + from azext_aks_agent.agent.error_handler import BinaryError + + # Setup mocks + mock_manager = Mock() + mock_manager.is_binary_available.return_value = False + mock_manager.validate_binary_version.return_value = False + + # Mock failed binary download + mock_binary_status = BinaryStatus(available=False, error_message="Download failed") + mock_manager.binary_manager.ensure_binary = AsyncMock(return_value=mock_binary_status) + + # Test the function + with pytest.raises(BinaryError) as exc_info: + await _setup_mcp_mode( + mock_manager, self.test_config_file, self.test_model, + self.test_api_key, self.test_max_steps, verbose=True + ) + + assert "Binary setup failed" in str(exc_info.value) + assert exc_info.value.error_code == "BINARY_SETUP" + + @pytest.mark.asyncio + async def test_setup_mcp_mode_server_start_failure(self): + """Test MCP mode setup when server fails to start.""" + from azext_aks_agent.agent.agent import _setup_mcp_mode + from azext_aks_agent.agent.binary_manager import BinaryStatus + from azext_aks_agent.agent.error_handler import ServerError + + # Setup mocks + mock_manager = Mock() + mock_manager.is_binary_available.return_value = True + mock_manager.validate_binary_version.return_value = True + mock_manager.start_server = AsyncMock(return_value=False) + + mock_binary_status = BinaryStatus(available=True, version_valid=True) + mock_manager.binary_manager.ensure_binary = AsyncMock(return_value=mock_binary_status) + + # Test the function + with pytest.raises(ServerError) as exc_info: + await _setup_mcp_mode( + mock_manager, self.test_config_file, self.test_model, + self.test_api_key, self.test_max_steps, verbose=True + ) + + assert "Server startup failed" in str(exc_info.value) + assert exc_info.value.error_code == "SERVER_STARTUP" + + def test_error_handler_functionality(self): + """Test the enhanced error handling system.""" + from azext_aks_agent.agent.error_handler import ( + AgentErrorHandler, MCPError, BinaryError, ServerError + ) + + # Test MCP setup error handling + original_error = ConnectionError("Network connection failed") + mcp_error = AgentErrorHandler.handle_mcp_setup_error(original_error, "initialization") + + assert isinstance(mcp_error, MCPError) + assert "MCP setup failed during initialization" in str(mcp_error) + assert mcp_error.error_code == "MCP_SETUP" + assert "Check your internet connection" in mcp_error.suggestions + + # Test binary error handling + binary_error = AgentErrorHandler.handle_binary_error( + Exception("Download timeout"), "download" + ) + + assert isinstance(binary_error, BinaryError) + assert "Binary download failed" in str(binary_error) + assert binary_error.error_code == "BINARY_DOWNLOAD" + assert "Verify you have internet connectivity" in binary_error.suggestions + + # Test server error handling + server_error = AgentErrorHandler.handle_server_error( + Exception("Port in use"), "startup" + ) + + assert isinstance(server_error, ServerError) + assert "MCP server startup failed" in str(server_error) + assert server_error.error_code == "SERVER_STARTUP" + assert "Check if the MCP binary is available and executable" in server_error.suggestions + + # Test error message formatting + formatted_message = AgentErrorHandler.format_error_message(mcp_error) + assert "AKS Agent Error (MCP_SETUP)" in formatted_message + assert "Suggestions:" in formatted_message + assert "Try running with --no-aks-mcp" in formatted_message + + def test_setup_traditional_mode_config_loading(self): + """Test traditional mode setup with actual config loading.""" + import tempfile + import yaml + from azext_aks_agent.agent.config_generator import ConfigurationGenerator + + # Create a temporary config file + test_config = { + "existing": "config", + "toolsets": { + "custom/toolset": {"enabled": True} + } + } + + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: + yaml.dump(test_config, f) + config_file_path = f.name + + try: + # Test loading and processing config as dictionary + from pathlib import Path + + expanded_config_file = Path(config_file_path) + base_config_dict = {} + + if expanded_config_file.exists(): + with open(expanded_config_file, 'r') as f: + base_config_dict = yaml.safe_load(f) or {} + + # Use ConfigurationGenerator to create traditional config + traditional_config_dict = ConfigurationGenerator.generate_traditional_config(base_config_dict) + + # Verify the configuration was processed correctly + assert "toolsets" in traditional_config_dict + assert "existing" in traditional_config_dict + assert traditional_config_dict["existing"] == "config" + + # Verify traditional toolsets are enabled + toolsets = traditional_config_dict["toolsets"] + assert toolsets["aks/core"]["enabled"] is True + assert toolsets["kubernetes/core"]["enabled"] is True + assert toolsets["kubernetes/live-metrics"]["enabled"] is True + assert toolsets["custom/toolset"]["enabled"] is True + + # Verify no MCP servers are configured + assert "mcp_servers" not in traditional_config_dict + + finally: + Path(config_file_path).unlink() # Clean up temp file + + @patch('sys.stdin') + def test_aks_agent_calls_sync_implementation(self, mock_stdin): + """Test that aks_agent works with new synchronous implementation.""" + from azext_aks_agent.agent.agent import aks_agent + + # Mock stdin to avoid pytest capture issues + mock_stdin.isatty.return_value = True # No piped input + + # Call the function with no_aks_mcp=True to avoid MCP setup + try: + aks_agent( + self.mock_cmd, "rg", "cluster", "test prompt", self.test_model, + self.test_api_key, self.test_max_steps, self.test_config_file, + False, False, False, False, True # no_aks_mcp=True + ) + except Exception as e: + # Expected to fail due to missing Holmes dependencies in test environment + # But it should fail gracefully without asyncio.run() errors + assert "cannot be called from a running event loop" not in str(e) + + @patch('sys.stdin') + def test_python_version_check(self, mock_stdin): + """Test that agent checks Python version requirement.""" + from azext_aks_agent.agent.agent import aks_agent + from knack.util import CLIError + + # Mock stdin to avoid pytest capture issues + mock_stdin.isatty.return_value = True # No piped input + + with patch('azext_aks_agent.agent.agent.sys') as mock_sys: + mock_sys.version_info = (3, 9, 0) # Below required version + + with pytest.raises(CLIError) as exc_info: + aks_agent( + self.mock_cmd, "rg", "cluster", "test prompt", self.test_model, + self.test_api_key, self.test_max_steps, self.test_config_file, + False, False, False, False, True # no_aks_mcp=True + ) + + assert "upgrade the python version to 3.10" in str(exc_info.value) diff --git a/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_parameters.py b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_parameters.py new file mode 100644 index 00000000000..fe881f48d57 --- /dev/null +++ b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_parameters.py @@ -0,0 +1,72 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- + +""" +Tests for AKS Agent parameter parsing and validation. +""" + +import unittest + + +class TestAksAgentParameters(unittest.TestCase): + """Test AKS Agent parameter parsing and functionality.""" + + def test_agent_function_signature_includes_no_aks_mcp(self): + """Test that the agent function includes no_aks_mcp parameter.""" + import inspect + from azext_aks_agent.agent.agent import aks_agent + + # Get function signature + sig = inspect.signature(aks_agent) + + # Verify no_aks_mcp parameter exists + self.assertIn("no_aks_mcp", sig.parameters) + + # Verify default value is False + param = sig.parameters["no_aks_mcp"] + self.assertEqual(param.default, False) + + def test_custom_function_signature_includes_no_aks_mcp(self): + """Test that the custom.py function includes no_aks_mcp parameter.""" + import inspect + from azext_aks_agent.custom import aks_agent + + # Get function signature + sig = inspect.signature(aks_agent) + + # Verify no_aks_mcp parameter exists + self.assertIn("no_aks_mcp", sig.parameters) + + # Verify default value is False + param = sig.parameters["no_aks_mcp"] + self.assertEqual(param.default, False) + + def test_parameter_boolean_type(self): + """Test that no_aks_mcp parameter behaves as a boolean flag.""" + # Test default value behavior + import inspect + from azext_aks_agent.agent.agent import aks_agent + + sig = inspect.signature(aks_agent) + param = sig.parameters["no_aks_mcp"] + + # Should have a default value of False + self.assertIsInstance(param.default, bool) + self.assertFalse(param.default) + + def test_parameter_docstring_updated(self): + """Test that the function docstring includes the new parameter.""" + from azext_aks_agent.agent.agent import aks_agent + + # Check if docstring mentions the new parameter + docstring = aks_agent.__doc__ + self.assertIsNotNone(docstring, "Function should have a docstring") + self.assertIn("no_aks_mcp", docstring, "Docstring should mention no_aks_mcp parameter") + self.assertIn("MCP", docstring, "Docstring should mention MCP") + + +if __name__ == '__main__': + unittest.main() + diff --git a/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_smart_refresh.py b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_smart_refresh.py new file mode 100644 index 00000000000..ce3c531461e --- /dev/null +++ b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_smart_refresh.py @@ -0,0 +1,218 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- + +""" +Tests for AKS Agent Smart Refresh Strategy functionality. +""" + +import os +import tempfile +import unittest +from unittest.mock import Mock, patch, mock_open + + +class TestAksAgentSmartRefresh(unittest.TestCase): + """Test AKS Agent Smart Refresh Strategy functionality.""" + + def setUp(self): + """Set up test fixtures.""" + self.mock_cmd = Mock() + self.mock_cmd.cli_ctx = Mock() + self.test_config_file = "~/.azure/aksAgent.config" + self.test_model = "gpt-4" + self.test_api_key = "test-key" + self.test_max_steps = 10 + + # Create temporary state file for testing + self.temp_dir = tempfile.mkdtemp() + self.test_state_file = os.path.join(self.temp_dir, "aks_agent_mode_state") + + def tearDown(self): + """Clean up test fixtures.""" + # Clean up temporary files + if os.path.exists(self.test_state_file): + os.unlink(self.test_state_file) + os.rmdir(self.temp_dir) + + @patch('azext_aks_agent.agent.agent.get_config_dir') + def test_get_mode_state_file(self, mock_get_config_dir): + """Test getting the mode state file path.""" + from azext_aks_agent.agent.agent import _get_mode_state_file + + mock_get_config_dir.return_value = "/test/config" + result = _get_mode_state_file() + + self.assertEqual(result, "/test/config/aks_agent_mode_state") + mock_get_config_dir.assert_called_once() + + @patch('azext_aks_agent.agent.agent._get_mode_state_file') + def test_get_last_mode_unknown(self, mock_get_state_file): + """Test getting last mode when no state file exists.""" + from azext_aks_agent.agent.agent import _get_last_mode + + mock_get_state_file.return_value = "/nonexistent/state/file" + result = _get_last_mode() + + self.assertEqual(result, "unknown") + + @patch('azext_aks_agent.agent.agent._get_mode_state_file') + def test_get_last_mode_valid(self, mock_get_state_file): + """Test getting last mode with valid state file.""" + from azext_aks_agent.agent.agent import _get_last_mode + + mock_get_state_file.return_value = self.test_state_file + + # Create state file with 'mcp' mode + with open(self.test_state_file, 'w') as f: + f.write("mcp") + + result = _get_last_mode() + self.assertEqual(result, "mcp") + + @patch('azext_aks_agent.agent.agent._get_mode_state_file') + def test_get_last_mode_invalid_content(self, mock_get_state_file): + """Test getting last mode with invalid state file content.""" + from azext_aks_agent.agent.agent import _get_last_mode + + mock_get_state_file.return_value = self.test_state_file + + # Create state file with invalid content + with open(self.test_state_file, 'w') as f: + f.write("invalid_mode") + + result = _get_last_mode() + self.assertEqual(result, "unknown") + + @patch('azext_aks_agent.agent.agent._get_mode_state_file') + def test_get_last_mode_io_error(self, mock_get_state_file): + """Test getting last mode handles IO errors gracefully.""" + from azext_aks_agent.agent.agent import _get_last_mode + + mock_get_state_file.return_value = self.test_state_file + + # Create state file with restricted permissions + with open(self.test_state_file, 'w') as f: + f.write("mcp") + os.chmod(self.test_state_file, 0o000) # No permissions + + try: + result = _get_last_mode() + self.assertEqual(result, "unknown") + finally: + # Restore permissions for cleanup + os.chmod(self.test_state_file, 0o644) + + @patch('azext_aks_agent.agent.agent._get_mode_state_file') + def test_save_current_mode_success(self, mock_get_state_file): + """Test saving current mode successfully.""" + from azext_aks_agent.agent.agent import _save_current_mode + + mock_get_state_file.return_value = self.test_state_file + + _save_current_mode("mcp") + + # Verify the file was written correctly + with open(self.test_state_file, 'r') as f: + content = f.read().strip() + self.assertEqual(content, "mcp") + + @patch('azext_aks_agent.agent.agent._get_mode_state_file') + def test_save_current_mode_invalid_mode(self, mock_get_state_file): + """Test saving invalid mode does nothing.""" + from azext_aks_agent.agent.agent import _save_current_mode + + mock_get_state_file.return_value = self.test_state_file + + _save_current_mode("invalid_mode") + + # Verify no file was created + self.assertFalse(os.path.exists(self.test_state_file)) + + @patch('azext_aks_agent.agent.agent._get_mode_state_file') + @patch('os.makedirs') + def test_save_current_mode_creates_directory(self, mock_makedirs, mock_get_state_file): + """Test saving mode creates directory if needed.""" + from azext_aks_agent.agent.agent import _save_current_mode + + state_file_path = "/test/new/dir/aks_agent_mode_state" + mock_get_state_file.return_value = state_file_path + + with patch('builtins.open', mock_open()) as mock_file: + _save_current_mode("traditional") + + # Verify directory creation was attempted + mock_makedirs.assert_called_once_with("/test/new/dir", exist_ok=True) + mock_file.assert_called_once_with(state_file_path, 'w') + + @patch('azext_aks_agent.agent.agent._get_last_mode') + def test_should_refresh_toolsets_first_run(self, mock_get_last_mode): + """Test refresh decision on first run (unknown state).""" + from azext_aks_agent.agent.agent import _should_refresh_toolsets + + mock_get_last_mode.return_value = "unknown" + + result = _should_refresh_toolsets("mcp", False) + self.assertTrue(result) + + @patch('azext_aks_agent.agent.agent._get_last_mode') + def test_should_refresh_toolsets_mode_transition(self, mock_get_last_mode): + """Test refresh decision on mode transition.""" + from azext_aks_agent.agent.agent import _should_refresh_toolsets + + mock_get_last_mode.return_value = "traditional" + + # Switching from traditional to MCP + result = _should_refresh_toolsets("mcp", False) + self.assertTrue(result) + + @patch('azext_aks_agent.agent.agent._get_last_mode') + def test_should_refresh_toolsets_same_mode(self, mock_get_last_mode): + """Test refresh decision when staying in same mode.""" + from azext_aks_agent.agent.agent import _should_refresh_toolsets + + mock_get_last_mode.return_value = "mcp" + + # Staying in MCP mode + result = _should_refresh_toolsets("mcp", False) + self.assertFalse(result) + + @patch('azext_aks_agent.agent.agent._get_last_mode') + def test_should_refresh_toolsets_user_request(self, mock_get_last_mode): + """Test refresh decision honors explicit user request.""" + from azext_aks_agent.agent.agent import _should_refresh_toolsets + + mock_get_last_mode.return_value = "mcp" + + # User explicitly requested refresh, even in same mode + result = _should_refresh_toolsets("mcp", True) + self.assertTrue(result) + + def test_should_refresh_toolsets_all_scenarios(self): + """Test all combinations of refresh decision logic.""" + from azext_aks_agent.agent.agent import _should_refresh_toolsets + + test_cases = [ + # (last_mode, requested_mode, user_refresh, expected_result, description) + ("unknown", "mcp", False, True, "First run - MCP"), + ("unknown", "traditional", False, True, "First run - Traditional"), + ("mcp", "mcp", False, False, "Same mode - MCP"), + ("traditional", "traditional", False, False, "Same mode - Traditional"), + ("mcp", "traditional", False, True, "Mode transition - MCP to Traditional"), + ("traditional", "mcp", False, True, "Mode transition - Traditional to MCP"), + ("mcp", "mcp", True, True, "User request - Same mode MCP"), + ("traditional", "traditional", True, True, "User request - Same mode Traditional"), + ("mcp", "traditional", True, True, "User request - Mode transition"), + ] + + for last_mode, requested_mode, user_refresh, expected, description in test_cases: + with self.subTest(description=description): + with patch('azext_aks_agent.agent.agent._get_last_mode', return_value=last_mode): + result = _should_refresh_toolsets(requested_mode, user_refresh) + self.assertEqual(result, expected, f"Failed for {description}") + + +if __name__ == '__main__': + unittest.main() + diff --git a/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_status.py b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_status.py new file mode 100644 index 00000000000..54a98f93ddc --- /dev/null +++ b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_status.py @@ -0,0 +1,390 @@ +""" +Unit tests for agent status collection functionality. +""" + +import os +import json +import tempfile +import pytest +from unittest.mock import Mock, patch +from datetime import datetime, timedelta + +from azext_aks_agent.agent.status import AgentStatusManager +from azext_aks_agent.agent.status_models import AgentStatus, BinaryStatus, ServerStatus, ConfigStatus + + +class TestAgentStatusManager: + """Test cases for AgentStatusManager.""" + + def setup_method(self): + """Set up test fixtures.""" + self.temp_dir = tempfile.mkdtemp() + self.status_manager = AgentStatusManager(config_dir=self.temp_dir) + + def teardown_method(self): + """Clean up test fixtures.""" + import shutil + shutil.rmtree(self.temp_dir, ignore_errors=True) + + def test_init_with_default_config_dir(self): + """Test initialization with default config directory.""" + with patch('azext_aks_agent.agent.status.get_config_dir') as mock_get_config_dir: + mock_get_config_dir.return_value = '/mock/config/dir' + + manager = AgentStatusManager() + + assert manager.config_dir == '/mock/config/dir' + mock_get_config_dir.assert_called_once() + + def test_init_with_custom_config_dir(self): + """Test initialization with custom config directory.""" + custom_dir = '/custom/config/dir' + manager = AgentStatusManager(config_dir=custom_dir) + + assert manager.config_dir == custom_dir + + @pytest.mark.asyncio + @patch('azext_aks_agent.agent.status.psutil') + async def test_get_status_success(self, mock_psutil): + """Test successful status collection.""" + # Mock binary manager + with patch.object(self.status_manager.binary_manager, 'get_binary_path') as mock_path, \ + patch.object(self.status_manager.binary_manager, 'get_binary_version') as mock_version, \ + patch.object(self.status_manager.binary_manager, 'validate_version') as mock_validate, \ + patch('os.path.exists') as mock_exists: + + mock_path.return_value = '/mock/binary/path' + mock_version.return_value = '1.0.0' + mock_validate.return_value = True + mock_exists.return_value = True + + # Mock process info + mock_process = Mock() + mock_process.create_time.return_value = datetime.now().timestamp() - 3600 # 1 hour ago + mock_psutil.Process.return_value = mock_process + + # Mock file stats + with patch('os.stat') as mock_stat, \ + patch('os.path.getmtime') as mock_getmtime: + + mock_stat.return_value.st_size = 1024 + mock_stat.return_value.st_mtime = datetime.now().timestamp() + mock_getmtime.return_value = datetime.now().timestamp() + + status = await self.status_manager.get_status() + + assert isinstance(status, AgentStatus) + assert isinstance(status.mcp_binary, BinaryStatus) + assert isinstance(status.server, ServerStatus) + assert isinstance(status.config, ConfigStatus) + + @pytest.mark.asyncio + async def test_get_status_with_error(self): + """Test status collection with error.""" + # Mock determine_current_mode to raise exception + with patch.object(self.status_manager, '_determine_current_mode', side_effect=Exception("Test error")): + + status = await self.status_manager.get_status() + + assert status.mode == "error" + assert "Status collection failed" in status.error_message + + def test_get_mcp_binary_status_available(self): + """Test MCP binary status when binary is available.""" + with patch.object(self.status_manager.binary_manager, 'get_binary_path') as mock_path, \ + patch.object(self.status_manager.binary_manager, 'get_binary_version') as mock_version, \ + patch.object(self.status_manager.binary_manager, 'validate_version') as mock_validate, \ + patch('os.path.exists') as mock_exists, \ + patch('azext_aks_agent.agent.status_models.BinaryStatus.from_file_path') as mock_from_path: + + mock_path.return_value = '/mock/binary/path' + mock_version.return_value = '1.0.0' + mock_validate.return_value = True + mock_exists.return_value = True + + expected_status = BinaryStatus(available=True, version='1.0.0', version_valid=True) + mock_from_path.return_value = expected_status + + result = self.status_manager._get_mcp_binary_status() + + assert result == expected_status + mock_from_path.assert_called_once_with('/mock/binary/path', version='1.0.0', version_valid=True) + + def test_get_mcp_binary_status_not_available(self): + """Test MCP binary status when binary is not available.""" + with patch.object(self.status_manager.binary_manager, 'get_binary_path', return_value='/mock/bin'), \ + patch('os.path.exists', return_value=False): + + result = self.status_manager._get_mcp_binary_status() + + assert not result.available + assert result.path == '/mock/bin' + assert result.error_message == 'Binary not found' + + @pytest.mark.asyncio + @patch('azext_aks_agent.agent.status.MCPManager') + @patch('azext_aks_agent.agent.status.psutil') + async def test_get_server_status_running_healthy(self, mock_psutil, mock_mcp_manager_class): + """Test server status when server is running and healthy.""" + # Mock MCP manager instance + mock_manager = Mock() + mock_manager.is_server_running.return_value = True + mock_manager.is_server_healthy.return_value = True + mock_manager.get_server_url.return_value = 'http://localhost:8003/sse' + mock_manager.get_server_port.return_value = 8003 + mock_manager.server_process = Mock() + mock_manager.server_process.pid = 12345 + + mock_mcp_manager_class.return_value = mock_manager + + # Mock process info + mock_process = Mock() + start_time = datetime.now() - timedelta(hours=1) + mock_process.create_time.return_value = start_time.timestamp() + mock_psutil.Process.return_value = mock_process + + result = await self.status_manager._get_server_status() + + assert result.running + assert result.healthy + assert result.url == 'http://localhost:8003/sse' + assert result.port == 8003 + assert result.pid == 12345 + assert result.uptime is not None + + @pytest.mark.asyncio + @patch('azext_aks_agent.agent.status.MCPManager') + async def test_get_server_status_not_running(self, mock_mcp_manager_class): + """Test server status when server is not running.""" + # Mock MCP manager instance + mock_manager = Mock() + mock_manager.is_server_running.return_value = False + mock_mcp_manager_class.return_value = mock_manager + + result = await self.status_manager._get_server_status() + + assert not result.running + assert not result.healthy + assert result.url is None + assert result.port is None + assert result.pid is None + + @pytest.mark.asyncio + @patch('azext_aks_agent.agent.status.MCPManager') + async def test_get_server_status_with_exception(self, mock_mcp_manager_class): + """Test server status collection with exception.""" + mock_mcp_manager_class.side_effect = Exception("Test error") + + result = await self.status_manager._get_server_status() + + assert not result.running + assert not result.healthy + assert "Server status check failed" in result.error_message + + def test_get_configuration_status_mcp_mode(self): + """Test configuration status in MCP mode.""" + # Create mock state file + state_file_path = os.path.join(self.temp_dir, "aks_agent_mode_state") + with open(state_file_path, 'w') as f: + json.dump({"last_mode": "mcp"}, f) + + # Create mock config file + config_file_path = os.path.join(self.temp_dir, "aksAgent.yaml") + config_data = { + "toolsets": { + "aks/core": {"enabled": False}, + "kubernetes/core": {"enabled": False} + }, + "mcp_servers": { + "aks-mcp": {"url": "http://localhost:8003/sse"} + } + } + with open(config_file_path, 'w') as f: + json.dump(config_data, f) + + with patch('azext_aks_agent.agent.status.ConfigurationGenerator.validate_mcp_config') as mock_validate: + mock_validate.return_value = True + + result = self.status_manager._get_configuration_status() + + assert result.mode == "mcp" + assert result.config_valid + assert len(result.mcp_servers) == 1 + assert "aks-mcp" in result.mcp_servers + + def test_get_configuration_status_traditional_mode(self): + """Test configuration status in traditional mode.""" + # Create mock state file + state_file_path = os.path.join(self.temp_dir, "aks_agent_mode_state") + with open(state_file_path, 'w') as f: + json.dump({"last_mode": "traditional"}, f) + + with patch('azext_aks_agent.agent.status.ConfigurationGenerator.validate_traditional_config') as mock_validate: + mock_validate.return_value = True + + result = self.status_manager._get_configuration_status() + + assert result.mode == "traditional" + assert result.config_valid + + def test_get_configuration_status_with_exception(self): + """Test configuration status collection with exception.""" + with patch('os.path.exists', side_effect=Exception("Test error")): + + result = self.status_manager._get_configuration_status() + + assert result.mode == "unknown" + assert not result.config_valid + assert "Configuration status check failed" in result.error_message + + def test_determine_current_mode_mcp(self): + """Test mode determination for MCP mode.""" + config_status = ConfigStatus(mode="mcp") + binary_status = BinaryStatus(available=True, version_valid=True) + server_status = ServerStatus(running=True) + + result = self.status_manager._determine_current_mode(config_status, binary_status, server_status) + + assert result == "mcp" + + def test_determine_current_mode_traditional(self): + """Test mode determination for traditional mode.""" + config_status = ConfigStatus(mode="traditional") + binary_status = BinaryStatus(available=False) + server_status = ServerStatus(running=False) + + result = self.status_manager._determine_current_mode(config_status, binary_status, server_status) + + assert result == "traditional" + + def test_determine_current_mode_inferred_mcp(self): + """Test mode determination inferred as MCP from component status.""" + config_status = ConfigStatus(mode="unknown") + binary_status = BinaryStatus(available=True, version_valid=True) + server_status = ServerStatus(running=True) + + result = self.status_manager._determine_current_mode(config_status, binary_status, server_status) + + assert result == "mcp" + + def test_determine_current_mode_mcp_available(self): + """Test mode determination for MCP available but server not running.""" + config_status = ConfigStatus(mode="unknown") + binary_status = BinaryStatus(available=True) + server_status = ServerStatus(running=False) + + result = self.status_manager._determine_current_mode(config_status, binary_status, server_status) + + assert result == "mcp_available" + + def test_get_last_mode_from_file(self): + """Test getting last mode from state file.""" + state_file_path = os.path.join(self.temp_dir, "aks_agent_mode_state") + with open(state_file_path, 'w') as f: + json.dump({"last_mode": "mcp"}, f) + + result = self.status_manager._get_last_mode() + + assert result == "mcp" + + def test_get_last_mode_no_file(self): + """Test getting last mode when file doesn't exist.""" + result = self.status_manager._get_last_mode() + + assert result == "unknown" + + def test_get_last_mode_invalid_json(self): + """Test getting last mode with invalid JSON in file.""" + state_file_path = os.path.join(self.temp_dir, "aks_agent_mode_state") + with open(state_file_path, 'w') as f: + f.write("invalid json") + + result = self.status_manager._get_last_mode() + + assert result == "unknown" + + def test_get_last_mode_change_time(self): + """Test getting last mode change time.""" + state_file_path = os.path.join(self.temp_dir, "aks_agent_mode_state") + with open(state_file_path, 'w') as f: + json.dump({"last_mode": "mcp"}, f) + + result = self.status_manager._get_last_mode_change_time() + + assert result is not None + assert isinstance(result, datetime) + + def test_get_last_mode_change_time_no_file(self): + """Test getting last mode change time when file doesn't exist.""" + result = self.status_manager._get_last_mode_change_time() + + assert result is None + + def test_get_last_used_timestamp(self): + """Test getting last used timestamp.""" + # Create some files with different timestamps + config_file_path = os.path.join(self.temp_dir, "aksAgent.yaml") + state_file_path = os.path.join(self.temp_dir, "aks_agent_mode_state") + + with open(config_file_path, 'w') as f: + f.write("{}") + + with open(state_file_path, 'w') as f: + f.write("{}") + + result = self.status_manager._get_last_used_timestamp() + + assert result is not None + assert isinstance(result, datetime) + + def test_get_last_used_timestamp_no_files(self): + """Test getting last used timestamp when no files exist.""" + result = self.status_manager._get_last_used_timestamp() + + assert result is None + + def test_load_config_file_json(self): + """Test loading JSON configuration file.""" + config_file_path = os.path.join(self.temp_dir, "test_config.json") + config_data = {"test": "data"} + + with open(config_file_path, 'w') as f: + json.dump(config_data, f) + + result = self.status_manager._load_config_file(config_file_path) + + assert result == config_data + + def test_load_config_file_yaml(self): + """Test loading YAML configuration file.""" + config_file_path = os.path.join(self.temp_dir, "test_config.yaml") + + with open(config_file_path, 'w') as f: + f.write("test: data\n") + + with patch('yaml.safe_load') as mock_yaml: + mock_yaml.return_value = {"test": "data"} + + result = self.status_manager._load_config_file(config_file_path) + + assert result == {"test": "data"} + + def test_load_config_file_nonexistent(self): + """Test loading nonexistent configuration file.""" + result = self.status_manager._load_config_file("/nonexistent/file.json") + + assert result is None + + def test_load_config_file_invalid_json_falls_back_to_yaml(self): + """Test loading invalid JSON configuration file falls back to YAML then fails gracefully.""" + config_file_path = os.path.join(self.temp_dir, "invalid.json") + + with open(config_file_path, 'w') as f: + f.write("invalid json content") + + # Mock yaml to also fail, simulating no yaml library or invalid yaml + with patch('yaml.safe_load', side_effect=Exception("YAML parse error")): + result = self.status_manager._load_config_file(config_file_path) + + assert result is None + diff --git a/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_status_command.py b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_status_command.py new file mode 100644 index 00000000000..bd92fd5ea4f --- /dev/null +++ b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_status_command.py @@ -0,0 +1,223 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- + +""" +Tests for AKS Agent Status command registration and functionality. +""" + +import unittest +from unittest.mock import Mock, patch + + +class TestAksAgentStatusCommand(unittest.TestCase): + """Test AKS Agent Status command registration and functionality.""" + + def test_aks_agent_status_function_exists(self): + """Test that the aks_agent_status function is properly defined.""" + from azext_aks_agent.custom import aks_agent_status + + # Verify function exists and is callable + self.assertTrue(callable(aks_agent_status)) + + def test_aks_agent_status_function_signature(self): + """Test that the aks_agent_status function has correct signature.""" + import inspect + from azext_aks_agent.custom import aks_agent_status + + # Get function signature + sig = inspect.signature(aks_agent_status) + + # Verify required parameters exist + self.assertIn("cmd", sig.parameters) + + # Verify verbose parameter is not present + self.assertNotIn("verbose", sig.parameters) + + def test_aks_agent_status_basic_execution(self): + """Test basic execution of aks_agent_status function.""" + from azext_aks_agent.custom import aks_agent_status + + # Mock all the imports that happen inside the function + with patch('azext_aks_agent.agent.binary_manager.AksMcpBinaryManager') as mock_binary_manager_class, \ + patch('azure.cli.core.api.get_config_dir') as mock_get_config_dir, \ + patch('rich.console.Console') as mock_console_class: + + # Setup config dir mock + mock_get_config_dir.return_value = "/mock/config" + + # Mock binary manager instance + mock_binary_instance = Mock() + mock_binary_instance.is_binary_available.return_value = False + mock_binary_instance.get_binary_path.return_value = "/mock/binary/path" + mock_binary_manager_class.return_value = mock_binary_instance + + # Mock rich console + mock_console = Mock() + mock_console_class.return_value = mock_console + + # Mock command context + mock_cmd = Mock() + + # Execute function + result = aks_agent_status(mock_cmd) + + # Verify function completes without error and returns None (status displayed via console) + self.assertIsNone(result) + + # Verify rich console was used + mock_console_class.assert_called() + mock_console.print.assert_called() + + def test_aks_agent_status_with_binary_available(self): + """Test aks_agent_status when MCP binary is available.""" + from azext_aks_agent.custom import aks_agent_status + + # Mock all the imports that happen inside the function + with patch('azext_aks_agent.agent.binary_manager.AksMcpBinaryManager') as mock_binary_manager_class, \ + patch('azext_aks_agent.agent.mcp_manager.MCPManager') as mock_mcp_manager_class, \ + patch('azure.cli.core.api.get_config_dir') as mock_get_config_dir, \ + patch('rich.console.Console') as mock_console_class: + + # Setup config dir mock + mock_get_config_dir.return_value = "/mock/config" + + # Mock binary manager with available binary + mock_binary_instance = Mock() + mock_binary_instance.is_binary_available.return_value = True + mock_binary_instance.get_binary_version.return_value = "1.0.0" + mock_binary_instance.get_binary_path.return_value = "/mock/binary/path" + mock_binary_instance.validate_version.return_value = True + mock_binary_manager_class.return_value = mock_binary_instance + + # Mock MCP manager + mock_mcp_instance = Mock() + mock_mcp_instance.is_server_running.return_value = False + mock_mcp_instance.is_server_healthy.return_value = False + mock_mcp_instance.get_server_url.return_value = None + mock_mcp_instance.get_server_port.return_value = None + mock_mcp_manager_class.return_value = mock_mcp_instance + + # Mock rich console + mock_console = Mock() + mock_console_class.return_value = mock_console + + mock_cmd = Mock() + + result = aks_agent_status(mock_cmd) + + # Verify function completes and returns None (status displayed via console) + self.assertIsNone(result) + + # Verify rich console was used + mock_console_class.assert_called() + mock_console.print.assert_called() + + def test_aks_agent_status_error_handling(self): + """Test aks_agent_status error handling with graceful fallback.""" + from azext_aks_agent.custom import aks_agent_status + + # Mock to raise exception during config dir access + with patch('azure.cli.core.api.get_config_dir') as mock_get_config_dir, \ + patch('rich.console.Console') as mock_console_class: + mock_get_config_dir.side_effect = Exception("Config dir error") + + # Mock rich console + mock_console = Mock() + mock_console_class.return_value = mock_console + + mock_cmd = Mock() + + # The function should gracefully handle the error and return None (status displayed via console) + result = aks_agent_status(mock_cmd) + + # Verify function completes without raising CLIError and returns None + self.assertIsNone(result) + + # Verify rich console was used + mock_console_class.assert_called() + mock_console.print.assert_called() + + def test_display_agent_status_basic(self): + """Test _display_agent_status function basic functionality.""" + from azext_aks_agent.custom import _display_agent_status + + # Create test status info + status_info = { + "mode": "traditional", + "mcp_binary": { + "available": False, + "path": "/mock/path", + "version": None + }, + "server": { + "running": False, + "healthy": False, + "url": None, + "port": None + } + } + + with patch('rich.console.Console') as mock_console_class: + # Mock rich console + mock_console = Mock() + mock_console_class.return_value = mock_console + + _display_agent_status(status_info) + + # Verify rich console was used + mock_console_class.assert_called() + mock_console.print.assert_called() + + def test_display_agent_status_verbose(self): + """Test _display_agent_status function with detailed information.""" + from azext_aks_agent.custom import _display_agent_status + + # Create test status info with more details + status_info = { + "mode": "mcp_ready", + "mcp_binary": { + "available": True, + "path": "/mock/binary/path", + "version": "1.0.0", + "version_valid": True + }, + "server": { + "running": True, + "healthy": True, + "url": "http://localhost:8003/sse", + "port": 8003 + } + } + + with patch('rich.console.Console') as mock_console_class: + # Mock rich console + mock_console = Mock() + mock_console_class.return_value = mock_console + + _display_agent_status(status_info) + + # Verify rich console was used + mock_console_class.assert_called() + mock_console.print.assert_called() + + # Check that print was called multiple times (for table and recommendations) + self.assertTrue(mock_console.print.call_count >= 2) + + def test_command_registration(self): + """Test that the agent status command is properly registered.""" + # Import functions to verify presence + try: + from azext_aks_agent.custom import aks_agent_status + from azext_aks_agent.custom import _display_agent_status + + self.assertTrue(callable(aks_agent_status)) + self.assertTrue(callable(_display_agent_status)) + except ImportError as e: + self.fail(f"Command registration imports failed: {e}") + + +if __name__ == '__main__': + unittest.main() + diff --git a/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_status_models.py b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_status_models.py new file mode 100644 index 00000000000..fad7b16a05f --- /dev/null +++ b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_status_models.py @@ -0,0 +1,474 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- + +"""Tests for status data models.""" + +import os +import tempfile +import unittest +from datetime import datetime, timedelta +from unittest.mock import patch + +from azext_aks_agent.agent.status_models import ( + BinaryStatus, + ServerStatus, + ConfigStatus, + AgentStatus, +) + + +class TestBinaryStatus(unittest.TestCase): + """Test BinaryStatus data model.""" + + def test_default_initialization(self): + """Test default values are set correctly.""" + status = BinaryStatus() + + self.assertFalse(status.available) + self.assertIsNone(status.version) + self.assertIsNone(status.path) + self.assertIsNone(status.last_updated) + self.assertIsNone(status.size) + self.assertFalse(status.version_valid) + self.assertIsNone(status.error_message) + self.assertFalse(status.ready) + + def test_initialization_with_values(self): + """Test initialization with specific values.""" + now = datetime.now() + status = BinaryStatus( + available=True, + version="1.0.0", + path="/tmp/binary", + last_updated=now, + size=1024, + version_valid=True + ) + + self.assertTrue(status.available) + self.assertEqual(status.version, "1.0.0") + self.assertEqual(status.path, "/tmp/binary") + self.assertEqual(status.last_updated, now) + self.assertEqual(status.size, 1024) + self.assertTrue(status.version_valid) + self.assertTrue(status.ready) + + def test_ready_property(self): + """Test ready property logic.""" + status = BinaryStatus() + + # Not ready when not available + self.assertFalse(status.ready) + + # Not ready when available but version not valid + status.available = True + self.assertFalse(status.ready) + + # Ready when both available and version valid + status.version_valid = True + self.assertTrue(status.ready) + + def test_from_file_path_nonexistent_file(self): + """Test from_file_path with non-existent file.""" + status = BinaryStatus.from_file_path("/nonexistent/path") + + self.assertFalse(status.available) + self.assertEqual(status.path, "/nonexistent/path") + self.assertIsNone(status.version) + self.assertIsNone(status.size) + self.assertIsNone(status.last_updated) + self.assertFalse(status.ready) + + def test_from_file_path_existing_file(self): + """Test from_file_path with existing file.""" + with tempfile.NamedTemporaryFile(delete=False) as tmp: + tmp.write(b"test content") + tmp_path = tmp.name + + try: + status = BinaryStatus.from_file_path( + tmp_path, + version="1.0.0", + version_valid=True + ) + + self.assertTrue(status.available) + self.assertEqual(status.path, tmp_path) + self.assertEqual(status.version, "1.0.0") + self.assertTrue(status.version_valid) + self.assertGreater(status.size, 0) + self.assertIsInstance(status.last_updated, datetime) + self.assertTrue(status.ready) + finally: + os.unlink(tmp_path) + + @patch('os.stat') + def test_from_file_path_os_error(self, mock_stat): + """Test from_file_path handles OS errors.""" + mock_stat.side_effect = OSError("Permission denied") + + # First need to ensure the file exists for the initial check + with patch('os.path.exists', return_value=True): + status = BinaryStatus.from_file_path("/test/path") + + self.assertFalse(status.available) + self.assertEqual(status.path, "/test/path") + self.assertIn("Failed to get file info", status.error_message) + + +class TestServerStatus(unittest.TestCase): + """Test ServerStatus data model.""" + + def test_default_initialization(self): + """Test default values are set correctly.""" + status = ServerStatus() + + self.assertFalse(status.running) + self.assertFalse(status.healthy) + self.assertIsNone(status.url) + self.assertIsNone(status.port) + self.assertIsNone(status.pid) + self.assertIsNone(status.uptime) + self.assertIsNone(status.start_time) + self.assertIsNone(status.error_message) + + def test_status_text_property(self): + """Test status_text property.""" + status = ServerStatus() + + # Stopped + self.assertEqual(status.status_text, "Stopped") + + # Running but unhealthy + status.running = True + self.assertEqual(status.status_text, "Running (Unhealthy)") + + # Running and healthy + status.healthy = True + self.assertEqual(status.status_text, "Running (Healthy)") + + def test_uptime_text_property(self): + """Test uptime_text property formatting.""" + status = ServerStatus() + + # No uptime + self.assertEqual(status.uptime_text, "N/A") + + # Seconds only + status.uptime = timedelta(seconds=30) + self.assertEqual(status.uptime_text, "30s") + + # Minutes and seconds + status.uptime = timedelta(minutes=5, seconds=30) + self.assertEqual(status.uptime_text, "5m 30s") + + # Hours, minutes, and seconds + status.uptime = timedelta(hours=2, minutes=15, seconds=45) + self.assertEqual(status.uptime_text, "2h 15m 45s") + + def test_full_initialization(self): + """Test initialization with all values.""" + start_time = datetime.now() + uptime = timedelta(hours=1) + + status = ServerStatus( + running=True, + healthy=True, + url="http://localhost:8003", + port=8003, + pid=12345, + uptime=uptime, + start_time=start_time + ) + + self.assertTrue(status.running) + self.assertTrue(status.healthy) + self.assertEqual(status.url, "http://localhost:8003") + self.assertEqual(status.port, 8003) + self.assertEqual(status.pid, 12345) + self.assertEqual(status.uptime, uptime) + self.assertEqual(status.start_time, start_time) + self.assertEqual(status.status_text, "Running (Healthy)") + + +class TestConfigStatus(unittest.TestCase): + """Test ConfigStatus data model.""" + + def test_default_initialization(self): + """Test default values and post_init behavior.""" + status = ConfigStatus() + + self.assertEqual(status.mode, "unknown") + self.assertIsNone(status.config_file) + self.assertEqual(status.toolsets_enabled, []) + self.assertEqual(status.mcp_servers, []) + self.assertIsNone(status.last_mode_change) + self.assertTrue(status.config_valid) + self.assertIsNone(status.error_message) + + def test_initialization_with_values(self): + """Test initialization with specific values.""" + toolsets = ["aks/core", "kubernetes/core"] + mcp_servers = ["aks-mcp"] + now = datetime.now() + + status = ConfigStatus( + mode="mcp", + config_file="/config/file.yaml", + toolsets_enabled=toolsets, + mcp_servers=mcp_servers, + last_mode_change=now, + config_valid=False + ) + + self.assertEqual(status.mode, "mcp") + self.assertEqual(status.config_file, "/config/file.yaml") + self.assertEqual(status.toolsets_enabled, toolsets) + self.assertEqual(status.mcp_servers, mcp_servers) + self.assertEqual(status.last_mode_change, now) + self.assertFalse(status.config_valid) + + def test_mode_properties(self): + """Test mode detection properties.""" + status = ConfigStatus() + + # Unknown mode + self.assertFalse(status.is_mcp_mode) + self.assertFalse(status.is_traditional_mode) + + # MCP mode + status.mode = "mcp" + self.assertTrue(status.is_mcp_mode) + self.assertFalse(status.is_traditional_mode) + + # Traditional mode + status.mode = "traditional" + self.assertFalse(status.is_mcp_mode) + self.assertTrue(status.is_traditional_mode) + + # Case insensitive + status.mode = "MCP" + self.assertTrue(status.is_mcp_mode) + + status.mode = "TRADITIONAL" + self.assertTrue(status.is_traditional_mode) + + def test_count_properties(self): + """Test count properties.""" + status = ConfigStatus() + + self.assertEqual(status.active_toolsets_count, 0) + self.assertEqual(status.mcp_servers_count, 0) + + status.toolsets_enabled = ["toolset1", "toolset2", "toolset3"] + status.mcp_servers = ["server1", "server2"] + + self.assertEqual(status.active_toolsets_count, 3) + self.assertEqual(status.mcp_servers_count, 2) + + +class TestAgentStatus(unittest.TestCase): + """Test AgentStatus data model.""" + + def test_default_initialization(self): + """Test default values and component creation.""" + status = AgentStatus() + + self.assertEqual(status.mode, "unknown") + self.assertIsInstance(status.mcp_binary, BinaryStatus) + self.assertIsInstance(status.server, ServerStatus) + self.assertIsInstance(status.config, ConfigStatus) + self.assertIsNone(status.last_used) + self.assertEqual(status.overall_health, "unknown") + self.assertIsNone(status.error_message) + + def test_initialization_with_components(self): + """Test initialization with provided components.""" + binary = BinaryStatus(available=True, version_valid=True) + server = ServerStatus(running=True, healthy=True) + config = ConfigStatus(mode="mcp", config_valid=True) + + status = AgentStatus( + mode="mcp", + mcp_binary=binary, + server=server, + config=config + ) + + self.assertEqual(status.mode, "mcp") + self.assertEqual(status.mcp_binary, binary) + self.assertEqual(status.server, server) + self.assertEqual(status.config, config) + self.assertEqual(status.overall_health, "healthy") + + def test_overall_health_mcp_mode(self): + """Test overall health calculation in MCP mode.""" + status = AgentStatus() + status.config.mode = "mcp" + + # Unhealthy when binary not ready + status._update_overall_health() + self.assertEqual(status.overall_health, "degraded") + + # Unhealthy when server not running + status.mcp_binary.available = True + status.mcp_binary.version_valid = True + status._update_overall_health() + self.assertEqual(status.overall_health, "degraded") + + # Unhealthy when server not healthy + status.server.running = True + status._update_overall_health() + self.assertEqual(status.overall_health, "degraded") + + # Healthy when all components ready + status.server.healthy = True + status._update_overall_health() + self.assertEqual(status.overall_health, "healthy") + + def test_overall_health_traditional_mode(self): + """Test overall health calculation in traditional mode.""" + status = AgentStatus() + status.config.mode = "traditional" + + # Healthy when config valid + status._update_overall_health() + self.assertEqual(status.overall_health, "healthy") + + # Degraded when config invalid + status.config.config_valid = False + status._update_overall_health() + self.assertEqual(status.overall_health, "degraded") + + def test_overall_health_error(self): + """Test overall health with error message.""" + status = AgentStatus() + status.config.mode = "mcp" + status.mcp_binary.available = True + status.mcp_binary.version_valid = True + status.server.running = True + status.server.healthy = True + status.error_message = "Some error occurred" + + status._update_overall_health() + self.assertEqual(status.overall_health, "error") + + def test_health_properties(self): + """Test health status properties.""" + status = AgentStatus() + + # Unknown + status.overall_health = "unknown" + self.assertFalse(status.is_healthy) + self.assertFalse(status.is_operational) + + # Healthy + status.overall_health = "healthy" + self.assertTrue(status.is_healthy) + self.assertTrue(status.is_operational) + + # Degraded + status.overall_health = "degraded" + self.assertFalse(status.is_healthy) + self.assertTrue(status.is_operational) + + # Error + status.overall_health = "error" + self.assertFalse(status.is_healthy) + self.assertFalse(status.is_operational) + + def test_emoji_properties(self): + """Test emoji representation properties.""" + status = AgentStatus() + + # Health emojis + status.overall_health = "healthy" + self.assertEqual(status.health_emoji, "✅") + + status.overall_health = "degraded" + self.assertEqual(status.health_emoji, "⚠️") + + status.overall_health = "error" + self.assertEqual(status.health_emoji, "❌") + + status.overall_health = "unknown" + self.assertEqual(status.health_emoji, "❓") + + # Mode emojis + status.config.mode = "mcp" + self.assertEqual(status.mode_emoji, "🚀") + + status.config.mode = "traditional" + self.assertEqual(status.mode_emoji, "🔧") + + status.config.mode = "unknown" + self.assertEqual(status.mode_emoji, "❓") + + def test_get_summary(self): + """Test summary string generation.""" + status = AgentStatus() + status.overall_health = "healthy" + status.mode = "mcp" + status.config.mode = "mcp" + + summary = status.get_summary() + expected = "✅ Healthy | 🚀 Mcp Mode" + self.assertEqual(summary, expected) + + def test_get_recommendations_mcp_mode(self): + """Test recommendations for MCP mode.""" + status = AgentStatus() + status.config.mode = "mcp" + + # Binary not available + recommendations = status.get_recommendations() + self.assertIn("Download the MCP binary", recommendations[0]) + + # Binary available but version invalid + status.mcp_binary.available = True + recommendations = status.get_recommendations() + self.assertIn("Update the MCP binary", recommendations[0]) + + # Server not running + status.mcp_binary.version_valid = True + recommendations = status.get_recommendations() + self.assertIn("Start the MCP server", recommendations[0]) + + # Server not healthy + status.server.running = True + recommendations = status.get_recommendations() + self.assertIn("Check MCP server logs", recommendations[0]) + + def test_get_recommendations_traditional_mode(self): + """Test recommendations for traditional mode.""" + status = AgentStatus() + status.config.mode = "traditional" + + # With ready binary, suggest MCP + status.mcp_binary.available = True + status.mcp_binary.version_valid = True + recommendations = status.get_recommendations() + self.assertIn("Consider using MCP mode", recommendations[0]) + + def test_get_recommendations_error(self): + """Test recommendations with error.""" + status = AgentStatus() + status.error_message = "Test error" + + recommendations = status.get_recommendations() + self.assertIn("Resolve error: Test error", recommendations[0]) + + def test_get_recommendations_config_invalid(self): + """Test recommendations with invalid config.""" + status = AgentStatus() + status.config.config_valid = False + + recommendations = status.get_recommendations() + self.assertIn("Check configuration file", recommendations[0]) + + +if __name__ == "__main__": + unittest.main() + diff --git a/src/aks-agent/azext_aks_agent/tests/latest/test_mcp_manager.py b/src/aks-agent/azext_aks_agent/tests/latest/test_mcp_manager.py new file mode 100644 index 00000000000..478045d433e --- /dev/null +++ b/src/aks-agent/azext_aks_agent/tests/latest/test_mcp_manager.py @@ -0,0 +1,393 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- + +import unittest +from unittest import IsolatedAsyncioTestCase +import os +import tempfile +import asyncio +from unittest.mock import Mock, patch, AsyncMock + +from azext_aks_agent.agent.mcp_manager import MCPManager + + +class TestMCPManager(unittest.TestCase): + + def setUp(self): + """Set up test fixtures.""" + self.test_config_dir = tempfile.mkdtemp() + # Create the bin subdirectory that would be expected + self.test_bin_dir = os.path.join(self.test_config_dir, 'bin') + os.makedirs(self.test_bin_dir, exist_ok=True) + + # Set up event loop for async tests + self.loop = asyncio.new_event_loop() + asyncio.set_event_loop(self.loop) + + def tearDown(self): + """Clean up test fixtures.""" + import shutil + # Clean up async tasks + try: + self.loop.close() + except Exception: + pass + shutil.rmtree(self.test_config_dir, ignore_errors=True) + + @patch('azext_aks_agent.agent.mcp_manager.get_config_dir') + def test_init_with_default_config_dir(self, mock_get_config_dir): + """Test MCP manager initialization with default config directory.""" + mock_get_config_dir.return_value = '/mock/config/dir' + + manager = MCPManager() + + self.assertEqual(manager.config_dir, '/mock/config/dir') + self.assertFalse(manager.verbose) + self.assertIsNotNone(manager.binary_manager) + # Check server process management initialization + self.assertIsNone(manager.server_process) + self.assertIsNone(manager.server_url) + self.assertIsNone(manager.server_port) + mock_get_config_dir.assert_called_once() + + def test_init_with_custom_config_dir(self): + """Test MCP manager initialization with custom config directory.""" + manager = MCPManager(config_dir=self.test_config_dir, verbose=True) + + self.assertEqual(manager.config_dir, self.test_config_dir) + self.assertTrue(manager.verbose) + self.assertIsNotNone(manager.binary_manager) + # Check server process management initialization + self.assertIsNone(manager.server_process) + self.assertIsNone(manager.server_url) + self.assertIsNone(manager.server_port) + # Check that binary manager was initialized with correct path + expected_bin_path = os.path.join(self.test_config_dir, 'bin') + self.assertEqual(manager.binary_manager.install_dir, expected_bin_path) + + def test_is_binary_available_true(self): + """Test binary availability check when binary is available.""" + manager = MCPManager(config_dir=self.test_config_dir) + + with patch.object(manager.binary_manager, 'is_binary_available', return_value=True): + self.assertTrue(manager.is_binary_available()) + + def test_is_binary_available_false(self): + """Test binary availability check when binary is not available.""" + manager = MCPManager(config_dir=self.test_config_dir) + + with patch.object(manager.binary_manager, 'is_binary_available', return_value=False): + self.assertFalse(manager.is_binary_available()) + + def test_get_binary_version_with_version(self): + """Test getting binary version when version is available.""" + manager = MCPManager(config_dir=self.test_config_dir) + expected_version = "0.1.0" + + with patch.object(manager.binary_manager, 'get_binary_version', return_value=expected_version): + version = manager.get_binary_version() + self.assertEqual(version, expected_version) + + def test_get_binary_version_none(self): + """Test getting binary version when no version is available.""" + manager = MCPManager(config_dir=self.test_config_dir) + + with patch.object(manager.binary_manager, 'get_binary_version', return_value=None): + version = manager.get_binary_version() + self.assertIsNone(version) + + def test_get_binary_path(self): + """Test getting binary path.""" + manager = MCPManager(config_dir=self.test_config_dir) + expected_path = os.path.join(self.test_bin_dir, 'aks-mcp') + + with patch.object(manager.binary_manager, 'get_binary_path', return_value=expected_path): + path = manager.get_binary_path() + self.assertEqual(path, expected_path) + + def test_validate_binary_version_valid(self): + """Test binary version validation when version is valid.""" + manager = MCPManager(config_dir=self.test_config_dir) + + with patch.object(manager.binary_manager, 'validate_version', return_value=True): + self.assertTrue(manager.validate_binary_version()) + + def test_validate_binary_version_invalid(self): + """Test binary version validation when version is invalid.""" + manager = MCPManager(config_dir=self.test_config_dir) + + with patch.object(manager.binary_manager, 'validate_version', return_value=False): + self.assertFalse(manager.validate_binary_version()) + + +class TestMCPManagerServerLifecycle(IsolatedAsyncioTestCase): + """Test server lifecycle management functionality.""" + + def setUp(self): + """Set up test fixtures for server tests.""" + self.test_config_dir = tempfile.mkdtemp() + self.test_bin_dir = os.path.join(self.test_config_dir, 'bin') + os.makedirs(self.test_bin_dir, exist_ok=True) + + def tearDown(self): + """Clean up test fixtures.""" + import shutil + shutil.rmtree(self.test_config_dir, ignore_errors=True) + + def test_initial_server_state(self): + """Test initial server state after initialization.""" + manager = MCPManager(config_dir=self.test_config_dir) + + self.assertIsNone(manager.server_process) + self.assertIsNone(manager.server_url) + self.assertIsNone(manager.server_port) + self.assertFalse(manager.is_server_running()) + self.assertFalse(manager.is_server_healthy()) + self.assertIsNone(manager.get_server_url()) + self.assertIsNone(manager.get_server_port()) + + def test_find_available_port_default(self): + """Test finding available port starting from default.""" + manager = MCPManager(config_dir=self.test_config_dir) + + port = manager._find_available_port(8003) + self.assertGreaterEqual(port, 8003) + self.assertLess(port, 8103) # Should be within 100 port range + + def test_find_available_port_custom_start(self): + """Test finding available port with custom start port.""" + manager = MCPManager(config_dir=self.test_config_dir) + + port = manager._find_available_port(9000) + self.assertGreaterEqual(port, 9000) + self.assertLess(port, 9100) # Should be within 100 port range + + @patch('socket.socket') + def test_find_available_port_no_ports_available(self, mock_socket): + """Test exception when no ports are available.""" + manager = MCPManager(config_dir=self.test_config_dir) + + # Mock all sockets to fail binding + mock_socket.return_value.__enter__.return_value.bind.side_effect = OSError("Port in use") + + with self.assertRaises(Exception) as cm: + manager._find_available_port(8003) + + self.assertIn("No available ports found", str(cm.exception)) + + def test_is_server_running_no_process(self): + """Test is_server_running when no process exists.""" + manager = MCPManager(config_dir=self.test_config_dir) + self.assertFalse(manager.is_server_running()) + + def test_is_server_running_with_process(self): + """Test is_server_running with active process.""" + manager = MCPManager(config_dir=self.test_config_dir) + + # Mock an active process + mock_process = Mock() + mock_process.returncode = None # Process is still running + manager.server_process = mock_process + + self.assertTrue(manager.is_server_running()) + + def test_is_server_running_with_terminated_process(self): + """Test is_server_running with terminated process.""" + manager = MCPManager(config_dir=self.test_config_dir) + + # Mock a terminated process + mock_process = Mock() + mock_process.returncode = 0 # Process has exited + manager.server_process = mock_process + + self.assertFalse(manager.is_server_running()) + + @patch('urllib.request.urlopen') + def test_is_server_healthy_success(self, mock_urlopen): + """Test server health check success.""" + manager = MCPManager(config_dir=self.test_config_dir) + + # Setup server state + manager.server_process = Mock() + manager.server_process.returncode = None + manager.server_url = "http://localhost:8003/sse" + + # Mock successful HTTP response + mock_response = Mock() + mock_response.status = 200 + mock_urlopen.return_value.__enter__.return_value = mock_response + + self.assertTrue(manager.is_server_healthy()) + mock_urlopen.assert_called_once_with("http://localhost:8003/sse", timeout=3) + + @patch('urllib.request.urlopen') + def test_is_server_healthy_http_error(self, mock_urlopen): + """Test server health check HTTP error.""" + manager = MCPManager(config_dir=self.test_config_dir) + + # Setup server state + manager.server_process = Mock() + manager.server_process.returncode = None + manager.server_url = "http://localhost:8003/sse" + + # Mock HTTP error + import urllib.error + mock_urlopen.side_effect = urllib.error.HTTPError( + "http://localhost:8003/sse", 500, "Server Error", {}, None + ) + + self.assertFalse(manager.is_server_healthy()) + + def test_is_server_healthy_no_url(self): + """Test server health check when no URL is set.""" + manager = MCPManager(config_dir=self.test_config_dir) + + # Setup server process but no URL + manager.server_process = Mock() + manager.server_process.returncode = None + # manager.server_url remains None + + self.assertFalse(manager.is_server_healthy()) + + def test_is_server_healthy_not_running(self): + """Test server health check when server is not running.""" + manager = MCPManager(config_dir=self.test_config_dir) + + # No server process + self.assertFalse(manager.is_server_healthy()) + + @patch('azext_aks_agent.agent.mcp_manager.asyncio.create_subprocess_exec') + @patch('azext_aks_agent.agent.mcp_manager.asyncio.sleep') + async def test_start_server_success(self, mock_sleep, mock_create_subprocess): + """Test successful server start.""" + manager = MCPManager(config_dir=self.test_config_dir) + + # Mock binary availability + with patch.object(manager, 'is_binary_available', return_value=True): + with patch.object(manager, 'get_binary_path', return_value='/fake/aks-mcp'): + with patch.object(manager, '_find_available_port', return_value=8003): + with patch.object(manager, 'is_server_healthy', return_value=True): + + # Mock subprocess creation + mock_process = AsyncMock() + mock_create_subprocess.return_value = mock_process + + result = await manager.start_server() + + self.assertTrue(result) + self.assertEqual(manager.server_process, mock_process) + self.assertEqual(manager.server_url, "http://localhost:8003/sse") + self.assertEqual(manager.server_port, 8003) + + # Verify subprocess was called correctly + mock_create_subprocess.assert_called_once() + args = mock_create_subprocess.call_args[0] + self.assertEqual(args, ('/fake/aks-mcp', '--transport', 'sse', '--port', '8003')) + + @patch('azext_aks_agent.agent.mcp_manager.asyncio.create_subprocess_exec') + @patch('azext_aks_agent.agent.mcp_manager.asyncio.sleep') + async def test_start_server_already_running_and_healthy(self, mock_sleep, mock_create_subprocess): + """Test start_server when server is already running and healthy.""" + manager = MCPManager(config_dir=self.test_config_dir, verbose=True) + + with patch.object(manager, 'is_binary_available', return_value=True): + with patch.object(manager, 'is_server_running', return_value=True): + with patch.object(manager, 'is_server_healthy', return_value=True): + with patch('azext_aks_agent.agent.user_feedback.ProgressReporter.show_status_message') as mock_progress: + + result = await manager.start_server() + + self.assertTrue(result) + # Should not create new subprocess + mock_create_subprocess.assert_not_called() + # Should show status message in verbose mode + mock_progress.assert_called_with("MCP server is already running and healthy", "info") + + async def test_start_server_unhealthy_restart(self): + """Test start_server restarts unhealthy running server.""" + manager = MCPManager(config_dir=self.test_config_dir) + + with patch.object(manager, 'is_binary_available', return_value=True): + with patch.object(manager, 'is_server_running', return_value=True): + with patch.object(manager, 'is_server_healthy', return_value=False): + with patch.object(manager, 'stop_server') as mock_stop: + with patch.object(manager, '_find_available_port', return_value=8003): + with patch.object(manager, 'get_binary_path', return_value='/fake/aks-mcp'): + with patch('azext_aks_agent.agent.mcp_manager.asyncio.create_subprocess_exec') as mock_create: + with patch('azext_aks_agent.agent.mcp_manager.asyncio.sleep'): + + # Mock the health check to fail first time, succeed second time + health_calls = [False, True] + + def side_effect(*args, **kwargs): + return health_calls.pop(0) if health_calls else True + + with patch.object(manager, 'is_server_healthy', side_effect=side_effect): + mock_process = AsyncMock() + mock_create.return_value = mock_process + + result = await manager.start_server() + + self.assertTrue(result) + mock_stop.assert_called_once() + + def test_stop_server_no_process(self): + """Test stop_server when no process exists.""" + manager = MCPManager(config_dir=self.test_config_dir) + + # Should not raise exception + manager.stop_server() + + self.assertIsNone(manager.server_process) + self.assertIsNone(manager.server_url) + self.assertIsNone(manager.server_port) + + def test_get_server_url_running(self): + """Test get_server_url when server is running.""" + manager = MCPManager(config_dir=self.test_config_dir) + + # Mock running server + manager.server_process = Mock() + manager.server_process.returncode = None + manager.server_url = "http://localhost:8003/sse" + + self.assertEqual(manager.get_server_url(), "http://localhost:8003/sse") + + def test_get_server_url_not_running(self): + """Test get_server_url when server is not running.""" + manager = MCPManager(config_dir=self.test_config_dir) + + self.assertIsNone(manager.get_server_url()) + + def test_get_server_port_running(self): + """Test get_server_port when server is running.""" + manager = MCPManager(config_dir=self.test_config_dir) + + # Mock running server + manager.server_process = Mock() + manager.server_process.returncode = None + manager.server_port = 8003 + + self.assertEqual(manager.get_server_port(), 8003) + + def test_get_server_port_not_running(self): + """Test get_server_port when server is not running.""" + manager = MCPManager(config_dir=self.test_config_dir) + + self.assertIsNone(manager.get_server_port()) + + +if __name__ == '__main__': + # Run tests including async tests + loader = unittest.TestLoader() + suite = unittest.TestSuite() + + # Add regular test cases + suite.addTests(loader.loadTestsFromTestCase(TestMCPManager)) + suite.addTests(loader.loadTestsFromTestCase(TestMCPManagerServerLifecycle)) + + runner = unittest.TextTestRunner(verbosity=2) + result = runner.run(suite) + diff --git a/src/aks-agent/azext_aks_agent/tests/latest/test_user_feedback.py b/src/aks-agent/azext_aks_agent/tests/latest/test_user_feedback.py new file mode 100644 index 00000000000..f210b0586f5 --- /dev/null +++ b/src/aks-agent/azext_aks_agent/tests/latest/test_user_feedback.py @@ -0,0 +1,218 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- + +import unittest +from unittest.mock import Mock, patch + +from azext_aks_agent.agent.user_feedback import ProgressReporter + + +class TestProgressReporter(unittest.TestCase): + """Test cases for ProgressReporter class.""" + + def setUp(self): + """Set up test fixtures.""" + self.mock_console = Mock() + + def test_show_download_progress_with_console(self): + """Test download progress reporting with provided console.""" + # Test normal progress + with patch('sys.stdout.isatty', return_value=True): + ProgressReporter.show_download_progress( + downloaded=500, + total=1000, + filename="test.bin", + console=self.mock_console + ) + + self.mock_console.print.assert_called_once() + call_args = self.mock_console.print.call_args[0][0] + self.assertIn("Downloading test.bin", call_args) + self.assertIn("50.0%", call_args) + + def test_show_download_progress_completion(self): + """Test download progress at completion.""" + with patch('sys.stdout.isatty', return_value=True): + ProgressReporter.show_download_progress( + downloaded=1000, + total=1000, + filename="complete.bin", + console=self.mock_console + ) + + # Should be called twice - once for progress, once for newline + self.assertEqual(self.mock_console.print.call_count, 2) + + def test_show_download_progress_zero_total(self): + """Test download progress with zero total bytes.""" + ProgressReporter.show_download_progress( + downloaded=100, + total=0, + filename="zero.bin", + console=self.mock_console + ) + + # Should not call console when total is 0 + self.mock_console.print.assert_not_called() + + def test_show_download_progress_no_tty(self): + """Test download progress when not in interactive terminal.""" + with patch('sys.stdout.isatty', return_value=False): + ProgressReporter.show_download_progress( + downloaded=500, + total=1000, + filename="notty.bin", + console=self.mock_console + ) + + # Should not call console when not TTY + self.mock_console.print.assert_not_called() + + def test_show_download_progress_no_console_holmes_available(self): + """Test download progress without console but Holmes available.""" + mock_holmes_console = Mock() + + with patch('holmes.utils.console.logging.init_logging', return_value=mock_holmes_console), \ + patch('sys.stdout.isatty', return_value=True): + ProgressReporter.show_download_progress( + downloaded=250, + total=1000, + filename="holmes.bin" + ) + + mock_holmes_console.print.assert_called_once() + + @patch('builtins.print') + def test_show_download_progress_no_console_no_holmes(self, mock_print): + """Test download progress fallback when Holmes not available.""" + with patch('holmes.utils.console.logging.init_logging', + side_effect=ImportError("Holmes not available")): + ProgressReporter.show_download_progress( + downloaded=750, + total=1000, + filename="fallback.bin" + ) + + mock_print.assert_called_once() + call_args = mock_print.call_args[0][0] + self.assertIn("Downloading fallback.bin", call_args) + self.assertIn("75.0%", call_args) + + def test_show_status_message_all_levels(self): + """Test status messages with different levels.""" + test_cases = [ + ("info", "[cyan]Test info[/cyan]"), + ("warning", "[yellow]Test warning[/yellow]"), + ("error", "[red]Test error[/red]"), + ("success", "[green]Test success[/green]"), + ("unknown", "[cyan]Test unknown[/cyan]"), # Should default to info style + ] + + for level, expected_style in test_cases: + with self.subTest(level=level): + self.mock_console.reset_mock() + ProgressReporter.show_status_message( + f"Test {level}", + level=level, + console=self.mock_console + ) + + self.mock_console.print.assert_called_once_with(expected_style) + + def test_show_status_message_no_console_holmes_available(self): + """Test status message without console but Holmes available.""" + mock_holmes_console = Mock() + + with patch('holmes.utils.console.logging.init_logging', return_value=mock_holmes_console): + ProgressReporter.show_status_message("Test message", "info") + + mock_holmes_console.print.assert_called_once_with("[cyan]Test message[/cyan]") + + @patch('builtins.print') + def test_show_status_message_no_console_no_holmes(self, mock_print): + """Test status message fallback when Holmes not available.""" + with patch('holmes.utils.console.logging.init_logging', + side_effect=ImportError("Holmes not available")): + ProgressReporter.show_status_message("Fallback message", "error") + + mock_print.assert_called_once_with("[ERROR] Fallback message") + + def test_show_binary_setup_status(self): + """Test binary setup status message.""" + ProgressReporter.show_binary_setup_status( + "Binary downloaded successfully", + console=self.mock_console + ) + + self.mock_console.print.assert_called_once_with( + "[cyan]MCP Binary: Binary downloaded successfully[/cyan]" + ) + + def test_show_server_status(self): + """Test server status message.""" + ProgressReporter.show_server_status( + "Server started on port 8003", + console=self.mock_console + ) + + self.mock_console.print.assert_called_once_with( + "[cyan]MCP Server: Server started on port 8003[/cyan]" + ) + + def test_show_server_status_silent_mode(self): + """Test server status in silent mode.""" + ProgressReporter.show_server_status( + "Server status", + silent_mode=True, + console=self.mock_console + ) + + # Currently still shows in silent mode - behavior may change in future + self.mock_console.print.assert_called_once() + + def test_progress_bar_formatting(self): + """Test progress bar formatting at various percentages.""" + test_cases = [ + (0, 1000, 0), # 0% + (250, 1000, 25), # 25% + (500, 1000, 50), # 50% + (750, 1000, 75), # 75% + (1000, 1000, 100), # 100% + (1200, 1000, 100), # Over 100% (clamped) + ] + + for downloaded, total, expected_percentage in test_cases: + with self.subTest(downloaded=downloaded, total=total): + self.mock_console.reset_mock() + + with patch('sys.stdout.isatty', return_value=True): + ProgressReporter.show_download_progress( + downloaded=downloaded, + total=total, + filename="test.bin", + console=self.mock_console + ) + + # For 100%, there are two calls - progress line and newline + if expected_percentage == 100: + self.assertEqual(self.mock_console.print.call_count, 2) + # First call is the progress line + call_args = self.mock_console.print.call_args_list[0][0][0] + else: + self.assertEqual(self.mock_console.print.call_count, 1) + call_args = self.mock_console.print.call_args[0][0] + + self.assertIn(f"{expected_percentage}.0%", call_args) + + # Check that progress bar contains appropriate characters + if expected_percentage > 0: + self.assertIn("█", call_args) # Filled portion should exist if > 0% + if expected_percentage < 100: + self.assertIn("░", call_args) # Empty portion should exist if < 100% + + +if __name__ == '__main__': + unittest.main() + diff --git a/src/aks-agent/setup.py b/src/aks-agent/setup.py index 50725225ecb..dfeb8a9d7dd 100644 --- a/src/aks-agent/setup.py +++ b/src/aks-agent/setup.py @@ -9,7 +9,7 @@ from setuptools import find_packages, setup -VERSION = "1.0.0b1" +VERSION = "1.0.0b2" CLASSIFIERS = [ "Development Status :: 4 - Beta", From 1910d8eef736c5ac1bdfc7f196abb88a0cb37c72 Mon Sep 17 00:00:00 2001 From: Qi Ni Date: Thu, 4 Sep 2025 15:49:43 +1000 Subject: [PATCH 02/11] use --status instead of status --- src/aks-agent/azext_aks_agent/_help.py | 17 +++++------------ src/aks-agent/azext_aks_agent/_params.py | 7 +++++++ .../azext_aks_agent/agent/binary_manager.py | 2 +- src/aks-agent/azext_aks_agent/commands.py | 4 ---- src/aks-agent/azext_aks_agent/custom.py | 4 ++++ 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/aks-agent/azext_aks_agent/_help.py b/src/aks-agent/azext_aks_agent/_help.py index d3b4dc86dd0..5d9a85f32f2 100644 --- a/src/aks-agent/azext_aks_agent/_help.py +++ b/src/aks-agent/azext_aks_agent/_help.py @@ -48,6 +48,9 @@ - name: --refresh-toolsets type: bool short-summary: Refresh the toolsets status. + - name: --status + type: bool + short-summary: Show AKS agent configuration and status information. - name: --no-aks-mcp type: bool short-summary: Disable AKS MCP integration and use traditional toolsets. @@ -75,6 +78,8 @@ text: az aks agent "What is the status of my cluster?" --no-echo-request --model azure/my-gpt4.1-deployment - name: Refresh toolsets to get the latest available tools text: az aks agent "What is the status of my cluster?" --refresh-toolsets --model azure/my-gpt4.1-deployment + - name: Show agent status (MCP readiness) + text: az aks agent --status - name: Run agent with config file text: | az aks agent "Check kubernetes pod resource usage" --config-file /path/to/custom.yaml @@ -107,15 +112,3 @@ enabled: false ``` """ - -helps[ - "aks agent status" -] = """ - type: command - short-summary: Show AKS agent configuration and readiness (MCP binary, server, and mode). - long-summary: |- - Displays MCP binary availability and version, MCP server running/healthy state (if applicable), and overall mode. - examples: - - name: Show agent status - text: az aks agent status -""" diff --git a/src/aks-agent/azext_aks_agent/_params.py b/src/aks-agent/azext_aks_agent/_params.py index 31d91596852..320d8062bf4 100644 --- a/src/aks-agent/azext_aks_agent/_params.py +++ b/src/aks-agent/azext_aks_agent/_params.py @@ -17,6 +17,7 @@ def load_arguments(self, _): with self.argument_context("aks agent") as c: c.positional( "prompt", + nargs='?', help="Ask any question and answer using available tools.", ) c.argument( @@ -77,6 +78,12 @@ def load_arguments(self, _): help="Refresh the toolsets status.", action="store_true", ) + c.argument( + "status", + options_list=["--status"], + action="store_true", + help="Show AKS agent configuration and status information.", + ) c.argument( "no_aks_mcp", options_list=["--no-aks-mcp"], diff --git a/src/aks-agent/azext_aks_agent/agent/binary_manager.py b/src/aks-agent/azext_aks_agent/agent/binary_manager.py index 02ac4d7b93e..73f77bb9ffc 100644 --- a/src/aks-agent/azext_aks_agent/agent/binary_manager.py +++ b/src/aks-agent/azext_aks_agent/agent/binary_manager.py @@ -311,7 +311,7 @@ def _get_platform_binary_name(self) -> str: return binary_name - def _verify_binary_integrity(self, file_path: str, release_info: dict = None) -> bool: # pylint: disable=too-many-locals,too-many-return-statements,too-many-nested-blocks + def _verify_binary_integrity(self, file_path: str, release_info: dict = None) -> bool: # pylint: disable=too-many-locals,too-many-return-statements,too-many-nested-blocks,too-many-branches """ Verify downloaded binary integrity using in-toto attestation files. diff --git a/src/aks-agent/azext_aks_agent/commands.py b/src/aks-agent/azext_aks_agent/commands.py index 93205a8739e..726dbb56590 100644 --- a/src/aks-agent/azext_aks_agent/commands.py +++ b/src/aks-agent/azext_aks_agent/commands.py @@ -14,7 +14,3 @@ def load_command_table(self, _): "aks", ) as g: g.custom_command("agent", "aks_agent") - with self.command_group( - "aks agent", - ) as g: - g.custom_command("status", "aks_agent_status") diff --git a/src/aks-agent/azext_aks_agent/custom.py b/src/aks-agent/azext_aks_agent/custom.py index 0cbfe533265..4d1d05d5b85 100644 --- a/src/aks-agent/azext_aks_agent/custom.py +++ b/src/aks-agent/azext_aks_agent/custom.py @@ -27,8 +27,12 @@ def aks_agent( no_echo_request=False, show_tool_output=False, refresh_toolsets=False, + status=False, no_aks_mcp=False, ): + # If only status is requested, display and return early + if status: + return aks_agent_status(cmd) aks_agent_internal( cmd, From 6cfba8d18c1f12c55ade1bb7d0fd416c091f027b Mon Sep 17 00:00:00 2001 From: Qi Ni Date: Thu, 4 Sep 2025 17:09:12 +1000 Subject: [PATCH 03/11] address ai comments --- src/aks-agent/azext_aks_agent/agent/mcp_manager.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/aks-agent/azext_aks_agent/agent/mcp_manager.py b/src/aks-agent/azext_aks_agent/agent/mcp_manager.py index e641c313ca4..069bad9c69d 100644 --- a/src/aks-agent/azext_aks_agent/agent/mcp_manager.py +++ b/src/aks-agent/azext_aks_agent/agent/mcp_manager.py @@ -173,6 +173,7 @@ def stop_server(self) -> None: # pylint: disable=too-many-nested-blocks start_time = time.time() timeout = 8.0 check_interval = 0.1 + last_log_time = start_time # limit verbose logs to once every ~2 seconds while (time.time() - start_time) < timeout: time.sleep(check_interval) @@ -197,10 +198,15 @@ def stop_server(self) -> None: # pylint: disable=too-many-nested-blocks ) return - # Debug: Check every 2 seconds in verbose mode - if self.verbose and (time.time() - start_time) % 2.0 < check_interval: - elapsed = time.time() - start_time - ProgressReporter.show_status_message(f"Waiting for graceful shutdown... {elapsed:.1f}s", "info") + # Debug: emit a message at most once every ~2 seconds in verbose mode + if self.verbose: + now = time.time() + if (now - last_log_time) >= 2.0: + elapsed = now - start_time + ProgressReporter.show_status_message( + f"Waiting for graceful shutdown... {elapsed:.1f}s", "info" + ) + last_log_time = now # If we get here, timeout was reached and process might still be running try: From 0fd6ddedb3b998e477d27c66006e5787f05d4da3 Mon Sep 17 00:00:00 2001 From: Qi Ni Date: Fri, 5 Sep 2025 10:36:02 +1000 Subject: [PATCH 04/11] style --- src/aks-agent/azext_aks_agent/agent/mcp_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aks-agent/azext_aks_agent/agent/mcp_manager.py b/src/aks-agent/azext_aks_agent/agent/mcp_manager.py index 069bad9c69d..270ed47b08a 100644 --- a/src/aks-agent/azext_aks_agent/agent/mcp_manager.py +++ b/src/aks-agent/azext_aks_agent/agent/mcp_manager.py @@ -150,7 +150,7 @@ async def start_server(self) -> bool: self.stop_server() raise RuntimeError(f"Failed to start MCP server: {str(e)}") from e - def stop_server(self) -> None: # pylint: disable=too-many-nested-blocks + def stop_server(self) -> None: # pylint: disable=too-many-nested-blocks, too-many-branches """ Stop aks-mcp server process. """ From 615bbd6f6d2e1809dc12f4fbf9f27a368b25a78e Mon Sep 17 00:00:00 2001 From: Qi Ni Date: Mon, 8 Sep 2025 15:12:07 +1000 Subject: [PATCH 05/11] add pytest-asyncio dependency --- src/aks-agent/setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/aks-agent/setup.py b/src/aks-agent/setup.py index dfeb8a9d7dd..4b14b4696f0 100644 --- a/src/aks-agent/setup.py +++ b/src/aks-agent/setup.py @@ -25,6 +25,7 @@ DEPENDENCIES = [ "holmesgpt==0.12.6; python_version >= '3.10'", + "pytest-asyncio>=1.1.0", ] with open1("README.rst", "r", encoding="utf-8") as f: From 68c09de1a6be8d6599ec91b781e660f1e09549f3 Mon Sep 17 00:00:00 2001 From: Qi Ni Date: Tue, 9 Sep 2025 11:06:28 +1000 Subject: [PATCH 06/11] fix unit tests --- src/aks-agent/azext_aks_agent/agent/status.py | 5 ++ .../azext_aks_agent/tests/latest/conftest.py | 27 ++++++++++ .../{test_agent.py => test_aks_agent.py} | 0 ...nager.py => test_aks_agent_mcp_manager.py} | 5 +- .../tests/latest/test_aks_agent_status.py | 10 ++-- .../latest/test_aks_agent_status_command.py | 3 +- .../latest/test_aks_agent_status_models.py | 13 +++-- ...ack.py => test_aks_agent_user_feedback.py} | 49 +------------------ ...dators.py => test_aks_agent_validators.py} | 1 + 9 files changed, 51 insertions(+), 62 deletions(-) create mode 100644 src/aks-agent/azext_aks_agent/tests/latest/conftest.py rename src/aks-agent/azext_aks_agent/tests/latest/{test_agent.py => test_aks_agent.py} (100%) rename src/aks-agent/azext_aks_agent/tests/latest/{test_mcp_manager.py => test_aks_agent_mcp_manager.py} (99%) rename src/aks-agent/azext_aks_agent/tests/latest/{test_user_feedback.py => test_aks_agent_user_feedback.py} (75%) rename src/aks-agent/azext_aks_agent/tests/latest/{test_validators.py => test_aks_agent_validators.py} (99%) diff --git a/src/aks-agent/azext_aks_agent/agent/status.py b/src/aks-agent/azext_aks_agent/agent/status.py index 33eb1bdfc97..8df6c63e8fc 100644 --- a/src/aks-agent/azext_aks_agent/agent/status.py +++ b/src/aks-agent/azext_aks_agent/agent/status.py @@ -1,3 +1,8 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- + """ Status collection and management for AKS Agent. diff --git a/src/aks-agent/azext_aks_agent/tests/latest/conftest.py b/src/aks-agent/azext_aks_agent/tests/latest/conftest.py new file mode 100644 index 00000000000..8a51501a726 --- /dev/null +++ b/src/aks-agent/azext_aks_agent/tests/latest/conftest.py @@ -0,0 +1,27 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- + +""" +Pytest configuration for azext_aks_agent tests. + +Skips the entire test suite in this folder when running under Python < 3.10. +""" + +import sys +import pytest + + +def pytest_collection_modifyitems(config, items): + """Mark all tests in this folder as skipped on Python < 3.10. + + This ensures tests are still collected (so pytest exits with code 0), + but are skipped during execution under older Python versions. + """ + if sys.version_info < (3, 10): + skip_marker = pytest.mark.skip( + reason="azext_aks_agent tests require Python >= 3.10" + ) + for item in items: + item.add_marker(skip_marker) diff --git a/src/aks-agent/azext_aks_agent/tests/latest/test_agent.py b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent.py similarity index 100% rename from src/aks-agent/azext_aks_agent/tests/latest/test_agent.py rename to src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent.py diff --git a/src/aks-agent/azext_aks_agent/tests/latest/test_mcp_manager.py b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_mcp_manager.py similarity index 99% rename from src/aks-agent/azext_aks_agent/tests/latest/test_mcp_manager.py rename to src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_mcp_manager.py index 478045d433e..1743db4270b 100644 --- a/src/aks-agent/azext_aks_agent/tests/latest/test_mcp_manager.py +++ b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_mcp_manager.py @@ -37,7 +37,7 @@ def tearDown(self): shutil.rmtree(self.test_config_dir, ignore_errors=True) @patch('azext_aks_agent.agent.mcp_manager.get_config_dir') - def test_init_with_default_config_dir(self, mock_get_config_dir): + def test_mcp_manager_init_with_default_config_dir(self, mock_get_config_dir): """Test MCP manager initialization with default config directory.""" mock_get_config_dir.return_value = '/mock/config/dir' @@ -52,7 +52,7 @@ def test_init_with_default_config_dir(self, mock_get_config_dir): self.assertIsNone(manager.server_port) mock_get_config_dir.assert_called_once() - def test_init_with_custom_config_dir(self): + def test_mcp_manager_init_with_custom_config_dir(self): """Test MCP manager initialization with custom config directory.""" manager = MCPManager(config_dir=self.test_config_dir, verbose=True) @@ -390,4 +390,3 @@ def test_get_server_port_not_running(self): runner = unittest.TextTestRunner(verbosity=2) result = runner.run(suite) - diff --git a/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_status.py b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_status.py index 54a98f93ddc..995e0efdd4d 100644 --- a/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_status.py +++ b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_status.py @@ -1,3 +1,8 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- + """ Unit tests for agent status collection functionality. """ @@ -26,7 +31,7 @@ def teardown_method(self): import shutil shutil.rmtree(self.temp_dir, ignore_errors=True) - def test_init_with_default_config_dir(self): + def test_status_manager_init_with_default_config_dir(self): """Test initialization with default config directory.""" with patch('azext_aks_agent.agent.status.get_config_dir') as mock_get_config_dir: mock_get_config_dir.return_value = '/mock/config/dir' @@ -36,7 +41,7 @@ def test_init_with_default_config_dir(self): assert manager.config_dir == '/mock/config/dir' mock_get_config_dir.assert_called_once() - def test_init_with_custom_config_dir(self): + def test_status_manager_init_with_custom_config_dir(self): """Test initialization with custom config directory.""" custom_dir = '/custom/config/dir' manager = AgentStatusManager(config_dir=custom_dir) @@ -387,4 +392,3 @@ def test_load_config_file_invalid_json_falls_back_to_yaml(self): result = self.status_manager._load_config_file(config_file_path) assert result is None - diff --git a/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_status_command.py b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_status_command.py index bd92fd5ea4f..aa237d89538 100644 --- a/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_status_command.py +++ b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_status_command.py @@ -205,7 +205,7 @@ def test_display_agent_status_verbose(self): # Check that print was called multiple times (for table and recommendations) self.assertTrue(mock_console.print.call_count >= 2) - def test_command_registration(self): + def test_status_command_registration(self): """Test that the agent status command is properly registered.""" # Import functions to verify presence try: @@ -220,4 +220,3 @@ def test_command_registration(self): if __name__ == '__main__': unittest.main() - diff --git a/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_status_models.py b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_status_models.py index fad7b16a05f..ede96d59175 100644 --- a/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_status_models.py +++ b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_status_models.py @@ -22,7 +22,7 @@ class TestBinaryStatus(unittest.TestCase): """Test BinaryStatus data model.""" - def test_default_initialization(self): + def test_binary_status_default_initialization(self): """Test default values are set correctly.""" status = BinaryStatus() @@ -35,7 +35,7 @@ def test_default_initialization(self): self.assertIsNone(status.error_message) self.assertFalse(status.ready) - def test_initialization_with_values(self): + def test_binary_status_initialization_with_values(self): """Test initialization with specific values.""" now = datetime.now() status = BinaryStatus( @@ -121,7 +121,7 @@ def test_from_file_path_os_error(self, mock_stat): class TestServerStatus(unittest.TestCase): """Test ServerStatus data model.""" - def test_default_initialization(self): + def test_server_status_default_initialization(self): """Test default values are set correctly.""" status = ServerStatus() @@ -196,7 +196,7 @@ def test_full_initialization(self): class TestConfigStatus(unittest.TestCase): """Test ConfigStatus data model.""" - def test_default_initialization(self): + def test_config_status_default_initialization(self): """Test default values and post_init behavior.""" status = ConfigStatus() @@ -208,7 +208,7 @@ def test_default_initialization(self): self.assertTrue(status.config_valid) self.assertIsNone(status.error_message) - def test_initialization_with_values(self): + def test_config_status_initialization_with_values(self): """Test initialization with specific values.""" toolsets = ["aks/core", "kubernetes/core"] mcp_servers = ["aks-mcp"] @@ -272,7 +272,7 @@ def test_count_properties(self): class TestAgentStatus(unittest.TestCase): """Test AgentStatus data model.""" - def test_default_initialization(self): + def test_agent_status_default_initialization(self): """Test default values and component creation.""" status = AgentStatus() @@ -471,4 +471,3 @@ def test_get_recommendations_config_invalid(self): if __name__ == "__main__": unittest.main() - diff --git a/src/aks-agent/azext_aks_agent/tests/latest/test_user_feedback.py b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_user_feedback.py similarity index 75% rename from src/aks-agent/azext_aks_agent/tests/latest/test_user_feedback.py rename to src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_user_feedback.py index f210b0586f5..3a13cbfedd9 100644 --- a/src/aks-agent/azext_aks_agent/tests/latest/test_user_feedback.py +++ b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_user_feedback.py @@ -70,35 +70,7 @@ def test_show_download_progress_no_tty(self): # Should not call console when not TTY self.mock_console.print.assert_not_called() - def test_show_download_progress_no_console_holmes_available(self): - """Test download progress without console but Holmes available.""" - mock_holmes_console = Mock() - - with patch('holmes.utils.console.logging.init_logging', return_value=mock_holmes_console), \ - patch('sys.stdout.isatty', return_value=True): - ProgressReporter.show_download_progress( - downloaded=250, - total=1000, - filename="holmes.bin" - ) - - mock_holmes_console.print.assert_called_once() - - @patch('builtins.print') - def test_show_download_progress_no_console_no_holmes(self, mock_print): - """Test download progress fallback when Holmes not available.""" - with patch('holmes.utils.console.logging.init_logging', - side_effect=ImportError("Holmes not available")): - ProgressReporter.show_download_progress( - downloaded=750, - total=1000, - filename="fallback.bin" - ) - - mock_print.assert_called_once() - call_args = mock_print.call_args[0][0] - self.assertIn("Downloading fallback.bin", call_args) - self.assertIn("75.0%", call_args) + def test_show_status_message_all_levels(self): """Test status messages with different levels.""" @@ -121,23 +93,7 @@ def test_show_status_message_all_levels(self): self.mock_console.print.assert_called_once_with(expected_style) - def test_show_status_message_no_console_holmes_available(self): - """Test status message without console but Holmes available.""" - mock_holmes_console = Mock() - - with patch('holmes.utils.console.logging.init_logging', return_value=mock_holmes_console): - ProgressReporter.show_status_message("Test message", "info") - - mock_holmes_console.print.assert_called_once_with("[cyan]Test message[/cyan]") - - @patch('builtins.print') - def test_show_status_message_no_console_no_holmes(self, mock_print): - """Test status message fallback when Holmes not available.""" - with patch('holmes.utils.console.logging.init_logging', - side_effect=ImportError("Holmes not available")): - ProgressReporter.show_status_message("Fallback message", "error") - - mock_print.assert_called_once_with("[ERROR] Fallback message") + def test_show_binary_setup_status(self): """Test binary setup status message.""" @@ -215,4 +171,3 @@ def test_progress_bar_formatting(self): if __name__ == '__main__': unittest.main() - diff --git a/src/aks-agent/azext_aks_agent/tests/latest/test_validators.py b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_validators.py similarity index 99% rename from src/aks-agent/azext_aks_agent/tests/latest/test_validators.py rename to src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_validators.py index 7e366d45f63..dde70a97ee3 100644 --- a/src/aks-agent/azext_aks_agent/tests/latest/test_validators.py +++ b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent_validators.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. See License.txt in the project root for license information. # -------------------------------------------------------------------------------------------- +# This test module was renamed to avoid name collision with 'acs' module tests. import os import shutil import tempfile From 8d28b82205653d0ccac2208c731aacbb57a1b375 Mon Sep 17 00:00:00 2001 From: Qi Ni Date: Thu, 11 Sep 2025 11:49:20 +1000 Subject: [PATCH 07/11] =?UTF-8?q?fix(aks-agent/mcp):=20eliminate=20?= =?UTF-8?q?=E2=80=9CEvent=20loop=20is=20closed=E2=80=9D=20shutdown=20error?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Launch aks-mcp via subprocess.Popen instead of asyncio.create_subprocess_exec to avoid asyncio transport GC on a closed loop. - Add robust teardown: terminate → wait(timeout) → kill fallback, and explicitly close stdin/stdout/stderr pipes. - Make is_server_running use Popen.poll() safely. - Minor: update MCP prompt to prefer kubectl node listing when Azure Compute ops are blocked by read-only policy. --- src/aks-agent/azext_aks_agent/agent/agent.py | 33 ++++- .../azext_aks_agent/agent/mcp_manager.py | 135 ++++++++++++++---- src/aks-agent/azext_aks_agent/agent/prompt.py | 4 + 3 files changed, 140 insertions(+), 32 deletions(-) diff --git a/src/aks-agent/azext_aks_agent/agent/agent.py b/src/aks-agent/azext_aks_agent/agent/agent.py index 784de7e8d62..80e665c026d 100644 --- a/src/aks-agent/azext_aks_agent/agent/agent.py +++ b/src/aks-agent/azext_aks_agent/agent/agent.py @@ -439,6 +439,13 @@ def __init__(self): self.mcp_manager = None self.config = None self._setup_params = None + self._loop = None # dedicated loop for MCP subprocess lifecycle + + def _ensure_loop(self): + """Ensure a dedicated event loop exists for MCP lifecycle.""" + import asyncio + if self._loop is None or self._loop.is_closed(): + self._loop = asyncio.new_event_loop() def setup_mcp_sync(self, config_params): """ @@ -462,7 +469,20 @@ def setup_mcp_sync(self, config_params): :raises: Exception if MCP setup fails """ self._setup_params = config_params - return _run_async_operation(self._setup_mcp_async, config_params) + self._ensure_loop() + try: + return self._loop.run_until_complete(self._setup_mcp_async(config_params)) + except RuntimeError as e: + # In rare cases if a conflicting loop is running in this thread, fall back to thread execution + if "This event loop is already running" in str(e): + import concurrent.futures + def run_in_thread(): + self._ensure_loop() + return self._loop.run_until_complete(self._setup_mcp_async(config_params)) + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: + future = executor.submit(run_in_thread) + return future.result(timeout=300) + raise def cleanup_mcp_sync(self): """ @@ -473,7 +493,8 @@ def cleanup_mcp_sync(self): """ if self.mcp_manager: try: - _run_async_operation(self._cleanup_mcp_async) + self._ensure_loop() + self._loop.run_until_complete(self._cleanup_mcp_async()) except Exception as e: # pylint: disable=broad-exception-caught # Log cleanup errors but don't re-raise to avoid masking original errors try: @@ -484,6 +505,14 @@ def cleanup_mcp_sync(self): except Exception: # pylint: disable=broad-exception-caught # Fallback to basic logging if ProgressReporter fails print(f"Warning: Error during MCP cleanup: {str(e)}") + finally: + # Close the dedicated loop after cleanup + try: + if self._loop is not None and not self._loop.is_closed(): + self._loop.close() + except Exception: + pass + self._loop = None async def _setup_mcp_async(self, config_params): """ diff --git a/src/aks-agent/azext_aks_agent/agent/mcp_manager.py b/src/aks-agent/azext_aks_agent/agent/mcp_manager.py index 270ed47b08a..b7cffdf2466 100644 --- a/src/aks-agent/azext_aks_agent/agent/mcp_manager.py +++ b/src/aks-agent/azext_aks_agent/agent/mcp_manager.py @@ -125,12 +125,13 @@ async def start_server(self) -> bool: if self.verbose: ProgressReporter.show_status_message(f"Starting MCP server on port {self.server_port}", "info") - # Start the server process + # Start the server using asyncio subprocess with DEVNULL stdio to avoid + # transport pipe cleanup on loop shutdown. self.server_process = await asyncio.create_subprocess_exec( *cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - stdin=subprocess.PIPE + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + stdin=subprocess.DEVNULL, ) # Wait a moment for server to initialize @@ -166,7 +167,11 @@ def stop_server(self) -> None: # pylint: disable=too-many-nested-blocks, too-ma ) # Gracefully terminate the process - self.server_process.terminate() + try: + self.server_process.terminate() + except Exception: # pylint: disable=broad-exception-caught + # Ignore terminate errors and continue with kill path + pass # Wait up to 8 seconds for graceful shutdown import time @@ -175,21 +180,32 @@ def stop_server(self) -> None: # pylint: disable=too-many-nested-blocks, too-ma check_interval = 0.1 last_log_time = start_time # limit verbose logs to once every ~2 seconds + # Poll for process exit (works for both Popen-like and asyncio Process) while (time.time() - start_time) < timeout: time.sleep(check_interval) + rc = None + try: + if hasattr(self.server_process, 'poll'): + rc = self.server_process.poll() + else: + rc = getattr(self.server_process, 'returncode', None) + except Exception: # pylint: disable=broad-exception-caught + rc = 0 # Assume exited - # Check if process is actually still running using OS-level check - # This works even when asyncio event loop is closed + if rc is not None: + if self.verbose: + from .user_feedback import ProgressReporter + elapsed = time.time() - start_time + ProgressReporter.show_status_message( + f"MCP server shut down gracefully in {elapsed:.2f}s", "info" + ) + return + # Fallback: OS-level existence check (handles asyncio Process without active loop) try: if pid != 'unknown': - os.kill(pid, 0) # Signal 0 checks if process exists without killing it - # If we get here, process is still running - else: - # Fallback to asyncio returncode if PID unavailable - if self.server_process.returncode is not None: - break + os.kill(pid, 0) + # If no exception, process still exists except (OSError, ProcessLookupError): - # Process no longer exists - graceful shutdown succeeded! if self.verbose: from .user_feedback import ProgressReporter elapsed = time.time() - start_time @@ -197,7 +213,6 @@ def stop_server(self) -> None: # pylint: disable=too-many-nested-blocks, too-ma f"MCP server shut down gracefully in {elapsed:.2f}s", "info" ) return - # Debug: emit a message at most once every ~2 seconds in verbose mode if self.verbose: now = time.time() @@ -210,34 +225,68 @@ def stop_server(self) -> None: # pylint: disable=too-many-nested-blocks, too-ma # If we get here, timeout was reached and process might still be running try: - if pid != 'unknown': - os.kill(pid, 0) # Check if still running - # Process is still running, need force kill + # If still running, force kill (support both Popen and asyncio Process) + still_running = False + try: + if hasattr(self.server_process, 'poll'): + still_running = self.server_process.poll() is None + else: + still_running = getattr(self.server_process, 'returncode', None) is None + except Exception: # pylint: disable=broad-exception-caught + still_running = False + + if still_running: if self.verbose: from .user_feedback import ProgressReporter ProgressReporter.show_status_message( f"Graceful shutdown timed out after {timeout}s, using force kill", "warning" ) - self.server_process.kill() + try: + # asyncio subprocess has .kill(); Popen has .kill() as well + self.server_process.kill() + except Exception: # pylint: disable=broad-exception-caught + pass # Wait up to 3 seconds for kill to complete kill_start = time.time() while (time.time() - kill_start) < 3.0: time.sleep(0.1) try: - os.kill(pid, 0) - except (OSError, ProcessLookupError): - if self.verbose: - ProgressReporter.show_status_message("MCP server force killed successfully", "info") - return + if hasattr(self.server_process, 'poll'): + if self.server_process.poll() is not None: + if self.verbose: + ProgressReporter.show_status_message("MCP server force killed successfully", "info") + break + else: + if getattr(self.server_process, 'returncode', None) is not None: + if self.verbose: + ProgressReporter.show_status_message("MCP server force killed successfully", "info") + break + except Exception: # pylint: disable=broad-exception-caught + break - if self.verbose: + # If still running, warn + try: + still_running = False + if hasattr(self.server_process, 'poll'): + still_running = self.server_process.poll() is None + else: + still_running = getattr(self.server_process, 'returncode', None) is None + # OS-level last check + if pid != 'unknown': + try: + os.kill(pid, 0) + except (OSError, ProcessLookupError): + still_running = False + except Exception: # pylint: disable=broad-exception-caught + still_running = False + if still_running and self.verbose: ProgressReporter.show_status_message( f"Warning: MCP server (PID: {pid}) may still be running", "warning" ) else: - # Fallback when PID not available + # Already terminated if self.verbose: from .user_feedback import ProgressReporter ProgressReporter.show_status_message( @@ -257,9 +306,26 @@ def stop_server(self) -> None: # pylint: disable=too-many-nested-blocks, too-ma from .user_feedback import ProgressReporter ProgressReporter.show_status_message(f"Error stopping server: {str(e)}", "error") finally: - self.server_process = None - self.server_url = None - self.server_port = None + # Explicitly close pipes to avoid lingering file descriptors + try: + stdin_obj = getattr(self.server_process, "stdin", None) + if stdin_obj is not None: + try: + stdin_obj.close() + except Exception: # pylint: disable=broad-exception-caught + pass + for stream_name in ("stdout", "stderr"): + stream_obj = getattr(self.server_process, stream_name, None) + # StreamReader doesn't have close(); ignore safely + if stream_obj is not None and hasattr(stream_obj, "close"): + try: + stream_obj.close() + except Exception: # pylint: disable=broad-exception-caught + pass + finally: + self.server_process = None + self.server_url = None + self.server_port = None def is_server_running(self) -> bool: """ @@ -272,7 +338,16 @@ def is_server_running(self) -> bool: return False # Check if process is still alive - return self.server_process.returncode is None + try: + # Prefer 'returncode' when available (stable for mocks/tests) + if hasattr(self.server_process, 'returncode'): + return getattr(self.server_process, 'returncode', None) is None + # Otherwise use poll() if available + if hasattr(self.server_process, 'poll'): + return self.server_process.poll() is None + return False + except Exception: # pylint: disable=broad-exception-caught + return False def is_server_healthy(self) -> bool: """ diff --git a/src/aks-agent/azext_aks_agent/agent/prompt.py b/src/aks-agent/azext_aks_agent/agent/prompt.py index 554d72c8d66..629ccb3c6a3 100644 --- a/src/aks-agent/azext_aks_agent/agent/prompt.py +++ b/src/aks-agent/azext_aks_agent/agent/prompt.py @@ -188,6 +188,10 @@ - For memory usage analysis: `kubectl_diagnostics` provides detailed resource consumption data - For storage metrics: `kubectl_diagnostics` can analyze PV/PVC usage and capacity +2. **Node Inventory (Read-Only Safe):** + - To list Kubernetes nodes and their readiness/status, prefer `kubectl get nodes -o wide` (via available Kubernetes tools) over Azure VMSS enumeration. + - If Azure Compute operations are restricted by read-only policy, do not attempt VMSS list/instance operations; use Kubernetes-level listing instead. + **MCP Advantage:** These tools provide context-aware analysis that traditional kubectl/az commands cannot match, offering deeper insights and automated correlation across the entire AKS stack. **Note**: "Cluster" in this context refers to both the Azure-managed AKS cluster AND the Kubernetes resources running within it. MCP tools provide unified access to both layers with enhanced context awareness. From e0fad72bdc933c6e5ff54e008284c3f793505f3c Mon Sep 17 00:00:00 2001 From: Qi Ni Date: Thu, 11 Sep 2025 14:24:43 +1000 Subject: [PATCH 08/11] {AKS} Clarify model parameter (cherry-pick PR #9145) Squashed cherry-pick of PR #9145 commits:\n- clarify model parameter\n- adjust command example to pretty print recommendation\n- fix disallowed html tag in deployment name\n- update examples to use model name as deployment name\n- remove redundant starting space in parameter help\n\nExcluded changes to HISTORY.rst and setup.py as requested. --- src/aks-agent/azext_aks_agent/_help.py | 55 ++++++++++++++++-------- src/aks-agent/azext_aks_agent/_params.py | 5 ++- 2 files changed, 40 insertions(+), 20 deletions(-) diff --git a/src/aks-agent/azext_aks_agent/_help.py b/src/aks-agent/azext_aks_agent/_help.py index 5d9a85f32f2..8e9f3da9238 100644 --- a/src/aks-agent/azext_aks_agent/_help.py +++ b/src/aks-agent/azext_aks_agent/_help.py @@ -26,7 +26,14 @@ short-summary: Name of the resource group. - name: --model type: string - short-summary: Model to use for the LLM. + short-summary: Specify the LLM provider and model or deployment to use for the AI assistant. + long-summary: |- + The --model parameter determines which large language model (LLM) and provider will be used to analyze your cluster. + For OpenAI, use the model name directly (e.g., gpt-4o). + For Azure OpenAI, use `azure/` (e.g., azure/gpt-4.1). + Each provider may require different environment variables and model naming conventions. + For a full list of supported providers, model patterns, and required environment variables, see https://docs.litellm.ai/docs/providers. + Note: For Azure OpenAI, it is recommended to set the deployment name as the model name until https://github.com/BerriAI/litellm/issues/13950 is resolved. - name: --api-key type: string short-summary: API key to use for the LLM (if not given, uses environment variables AZURE_API_KEY, OPENAI_API_KEY). @@ -61,31 +68,17 @@ export AZURE_API_BASE="https://my-azureopenai-service.openai.azure.com/" export AZURE_API_VERSION="2025-01-01-preview" export AZURE_API_KEY="sk-xxx" - az aks agent "Why are my pods not starting?" --name MyManagedCluster --resource-group MyResourceGroup --model azure/my-gpt4.1-deployment + az aks agent "Why are my pods not starting?" --name MyManagedCluster --resource-group MyResourceGroup --model azure/gpt-4.1 - name: Ask about pod issues in the cluster with OpenAI text: |- export OPENAI_API_KEY="sk-xxx" az aks agent "Why are my pods not starting?" --name MyManagedCluster --resource-group MyResourceGroup --model gpt-4o - - name: Run in interactive mode without a question - text: az aks agent "Check the pod status in my cluster" --name MyManagedCluster --resource-group MyResourceGroup --model azure/my-gpt4.1-deployment --api-key "sk-xxx" - - name: Run in non-interactive batch mode - text: az aks agent "Diagnose networking issues" --no-interactive --max-steps 15 --model azure/my-gpt4.1-deployment - - name: Show detailed tool output during analysis - text: az aks agent "Why is my service workload unavailable in namespace workload-ns?" --show-tool-output --model azure/my-gpt4.1-deployment - - name: Use custom configuration file - text: az aks agent "Check kubernetes pod resource usage" --config-file /path/to/custom.yaml --model azure/my-gpt4.1-deployment - - name: Run agent with no echo of the original question - text: az aks agent "What is the status of my cluster?" --no-echo-request --model azure/my-gpt4.1-deployment - - name: Refresh toolsets to get the latest available tools - text: az aks agent "What is the status of my cluster?" --refresh-toolsets --model azure/my-gpt4.1-deployment - - name: Show agent status (MCP readiness) - text: az aks agent --status - name: Run agent with config file text: | - az aks agent "Check kubernetes pod resource usage" --config-file /path/to/custom.yaml + az aks agent "Check kubernetes pod resource usage" --config-file /path/to/custom.yaml --name MyManagedCluster --resource-group MyResourceGroup Here is an example of config file: ```json - model: "gpt-4o" + model: "azure/gpt-4.1" api_key: "..." # define a list of mcp servers, mcp server can be defined mcp_servers: @@ -111,4 +104,30 @@ aks/core: enabled: false ``` + - name: Run in interactive mode without a question + text: az aks agent "Check the pod status in my cluster" --name MyManagedCluster --resource-group MyResourceGroup --model azure/gpt-4.1 --api-key "sk-xxx" + - name: Run in non-interactive batch mode + text: az aks agent "Diagnose networking issues" --no-interactive --max-steps 15 --model azure/gpt-4.1 + - name: Show detailed tool output during analysis + text: az aks agent "Why is my service workload unavailable in namespace workload-ns?" --show-tool-output --model azure/gpt-4.1 + - name: Use custom configuration file + text: az aks agent "Check kubernetes pod resource usage" --config-file /path/to/custom.yaml --model azure/gpt-4.1 + - name: Run agent with no echo of the original question + text: az aks agent "What is the status of my cluster?" --no-echo-request --model azure/gpt-4.1 + - name: Refresh toolsets to get the latest available tools + text: az aks agent "What is the status of my cluster?" --refresh-toolsets --model azure/gpt-4.1 + - name: Show agent status (MCP readiness) + text: az aks agent --status + - name: Run in interactive mode without a question + text: az aks agent "Check the pod status in my cluster" --name MyManagedCluster --resource-group MyResourceGroup --model azure/my-gpt4.1-deployment --api-key "sk-xxx" + - name: Run in non-interactive batch mode + text: az aks agent "Diagnose networking issues" --no-interactive --max-steps 15 --model azure/my-gpt4.1-deployment + - name: Show detailed tool output during analysis + text: az aks agent "Why is my service workload unavailable in namespace workload-ns?" --show-tool-output --model azure/my-gpt4.1-deployment + - name: Use custom configuration file + text: az aks agent "Check kubernetes pod resource usage" --config-file /path/to/custom.yaml --model azure/my-gpt4.1-deployment + - name: Run agent with no echo of the original question + text: az aks agent "What is the status of my cluster?" --no-echo-request --model azure/my-gpt4.1-deployment + - name: Refresh toolsets to get the latest available tools + text: az aks agent "What is the status of my cluster?" --refresh-toolsets --model azure/my-gpt4.1-deployment """ diff --git a/src/aks-agent/azext_aks_agent/_params.py b/src/aks-agent/azext_aks_agent/_params.py index 320d8062bf4..bb80094b3e9 100644 --- a/src/aks-agent/azext_aks_agent/_params.py +++ b/src/aks-agent/azext_aks_agent/_params.py @@ -19,6 +19,7 @@ def load_arguments(self, _): "prompt", nargs='?', help="Ask any question and answer using available tools.", + required=False, ) c.argument( "resource_group_name", @@ -48,12 +49,12 @@ def load_arguments(self, _): ) c.argument( "model", - help="The model to use for the LLM.", + help=" Specify the LLM provider and model or deployment to use for the AI assistant.", required=False, type=str, ) c.argument( - "api-key", + "api_key", help="API key to use for the LLM (if not given, uses environment variables AZURE_API_KEY, OPENAI_API_KEY)", required=False, type=str, From 158e625d2b4e425cba67234c8df37d5d71bd6cbd Mon Sep 17 00:00:00 2001 From: Qi Ni Date: Thu, 11 Sep 2025 14:35:03 +1000 Subject: [PATCH 09/11] chore: Add nilo19 and mainerd to aks agent owners --- .github/CODEOWNERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 450efca18cf..d6bacfad8e5 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -62,6 +62,8 @@ /src/aks-preview/ @andyzhangx @andyliuliming @fumingzhang +/src/aks-agent/ @nilo19 @mainerd + /src/bastion/ @aavalang /src/vm-repair/ @haagha From 7f6ebd836a49979ad04ae383ad091ccc7ced5b3a Mon Sep 17 00:00:00 2001 From: Qi Ni Date: Thu, 11 Sep 2025 14:40:29 +1000 Subject: [PATCH 10/11] chore(aks-agent): fix flake8 issues (E306, E261, W291) --- src/aks-agent/azext_aks_agent/agent/agent.py | 3 ++- src/aks-agent/azext_aks_agent/agent/mcp_manager.py | 12 +++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/aks-agent/azext_aks_agent/agent/agent.py b/src/aks-agent/azext_aks_agent/agent/agent.py index 80e665c026d..32563c37868 100644 --- a/src/aks-agent/azext_aks_agent/agent/agent.py +++ b/src/aks-agent/azext_aks_agent/agent/agent.py @@ -476,6 +476,7 @@ def setup_mcp_sync(self, config_params): # In rare cases if a conflicting loop is running in this thread, fall back to thread execution if "This event loop is already running" in str(e): import concurrent.futures + def run_in_thread(): self._ensure_loop() return self._loop.run_until_complete(self._setup_mcp_async(config_params)) @@ -510,7 +511,7 @@ def cleanup_mcp_sync(self): try: if self._loop is not None and not self._loop.is_closed(): self._loop.close() - except Exception: + except Exception: # pylint: disable=broad-exception-caught pass self._loop = None diff --git a/src/aks-agent/azext_aks_agent/agent/mcp_manager.py b/src/aks-agent/azext_aks_agent/agent/mcp_manager.py index b7cffdf2466..3423c2839b3 100644 --- a/src/aks-agent/azext_aks_agent/agent/mcp_manager.py +++ b/src/aks-agent/azext_aks_agent/agent/mcp_manager.py @@ -224,7 +224,7 @@ def stop_server(self) -> None: # pylint: disable=too-many-nested-blocks, too-ma last_log_time = now # If we get here, timeout was reached and process might still be running - try: + try: # pylint: disable=too-many-nested-blocks # If still running, force kill (support both Popen and asyncio Process) still_running = False try: @@ -256,12 +256,18 @@ def stop_server(self) -> None: # pylint: disable=too-many-nested-blocks, too-ma if hasattr(self.server_process, 'poll'): if self.server_process.poll() is not None: if self.verbose: - ProgressReporter.show_status_message("MCP server force killed successfully", "info") + ProgressReporter.show_status_message( + "MCP server force killed successfully", + "info" + ) break else: if getattr(self.server_process, 'returncode', None) is not None: if self.verbose: - ProgressReporter.show_status_message("MCP server force killed successfully", "info") + ProgressReporter.show_status_message( + "MCP server force killed successfully", + "info" + ) break except Exception: # pylint: disable=broad-exception-caught break From bddd5846c0c5090e44cff032f3e01e593f04c0ab Mon Sep 17 00:00:00 2001 From: Qi Ni Date: Thu, 11 Sep 2025 14:43:31 +1000 Subject: [PATCH 11/11] chore(aks-agent): flake8 E261 fix in mcp_manager.py (two spaces before inline comment) --- src/aks-agent/azext_aks_agent/agent/mcp_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aks-agent/azext_aks_agent/agent/mcp_manager.py b/src/aks-agent/azext_aks_agent/agent/mcp_manager.py index 3423c2839b3..21165c43da9 100644 --- a/src/aks-agent/azext_aks_agent/agent/mcp_manager.py +++ b/src/aks-agent/azext_aks_agent/agent/mcp_manager.py @@ -224,7 +224,7 @@ def stop_server(self) -> None: # pylint: disable=too-many-nested-blocks, too-ma last_log_time = now # If we get here, timeout was reached and process might still be running - try: # pylint: disable=too-many-nested-blocks + try: # pylint: disable=too-many-nested-blocks # If still running, force kill (support both Popen and asyncio Process) still_running = False try: