add interactive AI-powered debugging tool az aks agent

mainred · mainred · commit a6ff392c4444 · 2025-07-28T12:13:53.000Z
diff --git a/src/aks-preview/azext_aks_preview/_help.py b/src/aks-preview/azext_aks_preview/_help.py
@@ -3850,3 +3850,49 @@
         - name: Show details of a load balancer configuration in table format
           text: az aks loadbalancer show -g MyResourceGroup -n kubernetes --cluster-name MyManagedCluster -o table
 """
+
+helps['aks agent'] = """
+    type: command
+    short-summary: Run AI assistant to analyze and troubleshoot Kubernetes clusters.
+    parameters:
+        - name: prompt
+          type: string
+          short-summary: Ask any question and get an answer using available tools.
+        - name: --api-key
+          type: string
+          short-summary: API key to use for the LLM (if not given, uses environment variables AZURE_API_KEY, OPENAI_API_KEY).
+        - name: --model
+          type: string
+        - name: --config-file
+          type: string
+          short-summary: Path to configuration file.
+        - name: --max-steps
+          type: int
+          short-summary: Maximum number of steps the LLM can take to investigate the issue.
+        - name: --no-interactive
+          type: bool
+          short-summary: Disable interactive mode. When set, the agent will not prompt for input and will run in batch mode.
+        - name: --no-echo-request
+          type: bool
+          short-summary: Disable echoing back the question provided to AKS Agent in the output.
+        - name: --show-tool-output
+          type: bool
+          short-summary: Show the output of each tool that was called during the analysis.
+        - name: --refresh-toolsets
+          type: bool
+          short-summary: Refresh the toolsets status.
+
+    examples:
+        - name: Ask about pod issues in the cluster
+          text: az aks agent "Why are my pods not starting?" -g MyResourceGroup -n MyManagedCluster
+        - name: Analyze cluster with specific AI model
+          text: az aks agent "Check cluster health" -g MyResourceGroup -n MyManagedCluster --model azure/my-gpt4-deployment --api-key sk-xxx
+        - name: Run in non-interactive batch mode
+          text: az aks agent "Diagnose networking issues" -g MyResourceGroup -n MyManagedCluster --no-interactive --max-steps 5
+        - name: Use Azure OpenAI with custom deployment
+          text: az aks agent "Analyze failed deployments" -g MyResourceGroup -n MyManagedCluster --model azure/my-gpt4-deployment --api-key xxx
+        - name: Show detailed tool output during analysis
+          text: az aks agent "Why is my service unavailable?" -g MyResourceGroup -n MyManagedCluster --show-tool-output
+        - name: Use custom configuration file
+          text: az aks agent "Check resource usage" -g MyResourceGroup -n MyManagedCluster --config-file /path/to/custom.config
+"""
diff --git a/src/aks-preview/azext_aks_preview/_params.py b/src/aks-preview/azext_aks_preview/_params.py
@@ -23,6 +23,7 @@
     validate_nat_gateway_idle_timeout,
     validate_nat_gateway_managed_outbound_ip_count,
 )
+from azure.cli.core.api import get_config_dir
 from azure.cli.core.commands.parameters import (
     edge_zone_type,
     file_type,
@@ -2744,6 +2745,58 @@ def load_arguments(self, _):
                 help="Name of the load balancer configuration. Required.",
             )
 
+    with self.argument_context("aks agent") as c:
+        c.positional("prompt", help="Ask any question and get an answer using available tools.")
+        c.argument(
+            "api_key",
+            default=None,
+            required=False,
+            help="API key to use for the LLM (if not given, uses environment variables AZURE_API_KEY, OPENAI_API_KEY).",
+        )
+        c.argument(
+            "model",
+            default=None,
+            required=False,
+            help="Model to use for the LLM. For example, azure/<deployment_name> for Azure OpenAI, or <model_name> for OpenAI.",
+        )
+        c.argument(
+            "max_steps",
+            type=int,
+            default=10,
+            required=False,
+            help="Maximum number of steps the LLM can take to investigate the issue.",
+        )
+        c.argument(
+            "config_file",
+            type=str,
+            default=os.path.join(get_config_dir(), "aksAgent.config"),
+            required=False,
+            help="Path to the config file.",
+        )
+        c.argument(
+            "no_interactive",
+            help="Disable interactive mode. When set, the agent will not prompt for input and will run in batch mode.",
+            required=False,
+            action='store_true',
+        )
+        c.argument(
+            "no_echo_request",
+            help="Disable echoing back the question provided to AKS Agent in the output.",
+            required=False,
+            action='store_true',
+        )
+        c.argument(
+            "show_tool_output",
+            help="Show the output of each tool that was called.",
+            required=False,
+            action='store_true',
+        )
+        c.argument(
+            "refresh_toolsets",
+            help="Refresh the toolsets status.",
+            required=False,
+            action='store_true',
+        )
 
 def _get_default_install_location(exe_name):
     system = platform.system()
diff --git a/src/aks-preview/azext_aks_preview/commands.py b/src/aks-preview/azext_aks_preview/commands.py
@@ -186,6 +186,7 @@ def load_command_table(self, _):
         g.custom_command(
             "operation-abort", "aks_operation_abort", supports_no_wait=True
         )
+        g.custom_command("agent", "aks_agent")
 
     # AKS maintenance configuration commands
     with self.command_group(
diff --git a/src/aks-preview/azext_aks_preview/custom.py b/src/aks-preview/azext_aks_preview/custom.py
@@ -6,13 +6,17 @@
 # pylint: disable=too-many-lines, disable=broad-except
 import datetime
 import json
+import logging
 import os
 import os.path
+from pathlib import Path
 import platform
-import ssl
+import socket
 import sys
 import threading
 import time
+import typer
+import uuid
 import webbrowser
 
 from azext_aks_preview._client_factory import (
@@ -4333,3 +4337,204 @@ def aks_loadbalancer_rebalance_nodes(
     }
 
     return aks_loadbalancer_rebalance_internal(managed_clusters_client, parameters)
+
+
+def aks_agent(
+        cmd,
+        client,
+        resource_group_name,
+        name,
+        prompt,
+        api_key,
+        model,
+        max_steps,
+        config_file,
+        no_interactive=False,
+        no_echo_request=False,
+        show_tool_output=False,
+        refresh_toolsets=False,
+        ):
+    '''
+    Interact with the AKS agent using a prompt or piped input.
+
+    In interactive mode the function hands control to the Holmes interactive
+    loop. Otherwise it sends a single request to the LLM and prints the result
+    to the console.
+
+    :param cmd: CLI command object; its ``cli_ctx`` is used to resolve the subscription ID.
+    :param client: Not used by this function.
+    :param resource_group_name: Resource group of the AKS cluster, rendered into the system prompt.
+    :type resource_group_name: str
+    :param name: Name of the AKS cluster, rendered into the system prompt.
+    :type name: str
+    :param prompt: The prompt to send to the agent.
+    :type prompt: str
+    :param api_key: API key for authentication.
+    :type api_key: str
+    :param model: Model to use for the LLM.
+    :type model: str
+    :param max_steps: Maximum number of steps to take.
+    :type max_steps: int
+    :param config_file: Path to the config file.
+    :type config_file: str
+    :param no_interactive: Disable interactive mode.
+    :type no_interactive: bool
+    :param no_echo_request: Disable echoing back the question provided to AKS Agent in the output.
+    :type no_echo_request: bool
+    :param show_tool_output: Whether to show tool output.
+    :type show_tool_output: bool
+    :param refresh_toolsets: Refresh the toolsets status.
+    :type refresh_toolsets: bool
+    :raises typer.BadParameter: If there is no prompt and no piped input while interactive mode is disabled.
+    '''
+
+    # The CLI exposes negative flags (no_interactive / no_echo_request); convert
+    # them to the positive form used by the rest of this function.
+    interactive = not no_interactive
+    echo = not no_echo_request
+
+    # Holmes library allows the user to specify the agent name through environment variable before loading the library.
+    os.environ["AGENT_NAME"] = "AKS AGENT"
+    # NOTE(mainred): the LiteLLM log level must be raised before the LiteLLM library is loaded,
+    # otherwise the log lines it emits while loading are printed to the console.
+    logging.getLogger("LiteLLM").setLevel(logging.WARNING)
+    from holmes.config import Config
+    from holmes.core.prompt import build_initial_ask_messages
+    from holmes.interactive import run_interactive_loop
+    from holmes.plugins.destinations import DestinationType
+    from holmes.plugins.interfaces import Issue
+    from holmes.plugins.prompts import load_and_render_prompt
+    from holmes.utils.console.logging import init_logging
+    from holmes.utils.console.result import handle_result
+
+    # NOTE(mainred): holmes leverages the log handler RichHandler to provide colorful, readable and well-formatted logs
+    # making the interactive mode more user-friendly.
+    # And we remove existing log handlers to avoid duplicate logs.
+    # Also to make the console log consistent, we silence the telemetry and data loggers to skip redundant logs.
+    def init_log():
+        for noisy_logger in (
+            "telemetry.main",
+            "telemetry.process",
+            "telemetry.save",
+            "telemetry.client",
+            "az_command_data_logger",
+        ):
+            logging.getLogger(noisy_logger).setLevel(logging.WARNING)
+        # TODO: make log verbosity configurable; currently disabled by passing [].
+        return init_logging([])
+
+    console = init_log()
+
+    # Detect and read piped input
+    piped_data = None
+    if not sys.stdin.isatty():
+        piped_data = sys.stdin.read().strip()
+        if interactive:
+            console.print(
+                "[bold yellow]Interactive mode disabled when reading piped input[/bold yellow]"
+            )
+            interactive = False
+
+    # Validate the input before loading the config and building the LLM client,
+    # so an unusable invocation fails fast.
+    if not prompt and not interactive and not piped_data:
+        raise typer.BadParameter(
+            "The 'prompt' argument is required when interactive mode is disabled and no piped input is provided."
+        )
+
+    config_file = Path(config_file)
+    config = Config.load_from_file(
+        config_file,
+        api_key=api_key,
+        model=model,
+        max_steps=max_steps,
+    )
+
+    ai = config.create_console_toolcalling_llm(
+        dal=None,
+        refresh_toolsets=refresh_toolsets,
+    )
+    template_context = {
+        "toolsets": ai.tool_executor.toolsets,
+        "runbooks": config.get_runbook_catalog(),
+    }
+
+    # Handle piped data
+    if piped_data:
+        if prompt:
+            # User provided both piped data and a prompt
+            prompt = f"Here's some piped output:\n\n{piped_data}\n\n{prompt}"
+        else:
+            # Only piped data, no prompt - ask what to do with it
+            prompt = f"Here's some piped output:\n\n{piped_data}\n\nWhat can you tell me about this output?"
+
+    if echo and not interactive and prompt:
+        # The prompt may carry piped output containing square brackets. Print it with
+        # markup disabled so that such text is not interpreted as console markup tags.
+        # NOTE(review): relies on console being a rich Console (end= / markup= keywords) - confirm.
+        console.print("[bold yellow]User:[/bold yellow]", end=" ")
+        console.print(prompt, markup=False)
+
+    # Render the builtin generic system prompt; the AKS-specific context is appended below.
+    system_prompt = "builtin://generic_ask.jinja2"
+    system_prompt_rendered = load_and_render_prompt(system_prompt, template_context)
+
+    subscription_id = get_subscription_id(cmd.cli_ctx)
+
+    aks_template_context = {
+        "cluster_name": name,
+        "resource_group": resource_group_name,
+        "subscription_id": subscription_id,
+    }
+
+    aks_context_prompt = """
+# Azure Kubernetes Service (AKS)
+
+You are specifically working with Azure Kubernetes Service (AKS) clusters. All investigations and troubleshooting should be performed on the AKS cluster. When troubleshooting AKS, you should consider both Azure resources and Kubernetes resources.
+
+The current provided AKS context is as follow:
+cluster_name: {{cluster_name}}
+resource_group: {{resource_group}}
+subscription_id: {{subscription_id}}
+
+## Prerequisites
+### AKS cluster name is under the resource group and subscription specified
+
+You should check if the AKS cluster {{cluster_name}} can be found under resource group {{resource_group}} and subscription {{subscription_id}}.
+If not, you should prompt to the user to specify correct cluster name, resource group and subscription ID.
+
+### AKS cluster is in the current kubeconfig context
+If the current kubeconfig context is not set to the AKS cluster {{cluster_name}}, you should download the kubeconfig credential with the cluster name {{cluster_name}}, resource group name {{resource_group}} and subscription ID {{subscription_id}}.
+If the current kubeconfig context is set to the AKS cluster {{cluster_name}}, you should proceed with the investigation and troubleshooting.
+"""
+    aks_context_prompt = load_and_render_prompt(aks_context_prompt, aks_template_context)
+    system_prompt_rendered += aks_context_prompt
+
+    # Variables not exposed to the user.
+    # Adds a prompt for post processing.
+    post_processing_prompt = None
+    # File to append to prompt
+    include_file = None
+    # TODO: add refresh-toolset to refresh the toolset if it has changed
+    if interactive:
+        run_interactive_loop(
+            ai,
+            console,
+            system_prompt_rendered,
+            prompt,
+            post_processing_prompt,
+            include_file,
+            show_tool_output=show_tool_output,
+        )
+        return
+
+    # Non-interactive mode: send a single request and print the result.
+    messages = build_initial_ask_messages(
+        console,
+        system_prompt_rendered,
+        prompt,
+        include_file,
+    )
+
+    response = ai.call(messages)
+
+    messages = response.messages  # type: ignore # Update messages with the full history
+
+    issue = Issue(
+        id=str(uuid.uuid4()),
+        name=prompt,
+        source_type="holmes-ask",
+        raw={"prompt": prompt, "full_conversation": messages},
+        source_instance_id=socket.gethostname(),
+    )
+    # NOTE(review): the meaning of the trailing positional False is defined by
+    # holmes' handle_result - confirm against the library.
+    handle_result(
+        response,
+        console,
+        DestinationType.CLI,
+        config,
+        issue,
+        show_tool_output,
+        False,
+    )
+
diff --git a/src/aks-preview/setup.py b/src/aks-preview/setup.py
@@ -7,7 +7,7 @@
 
 from codecs import open as open1
 
-from setuptools import setup, find_packages
+from setuptools import find_packages, setup
 
 VERSION = "18.0.0b18"
 
@@ -23,7 +23,9 @@
     "License :: OSI Approved :: MIT License",
 ]
 
-DEPENDENCIES = []
+DEPENDENCIES = [
+    "holmesgpt @ git+https://github.com/robusta-dev/holmesgpt@dont_hardcode_agent_name",  # fetched over HTTPS so no GitHub SSH key is needed; will use an official PyPI package once available
+]
 
 with open1("README.rst", "r", encoding="utf-8") as f:
     README = f.read()

Original file line number	Diff line number	Diff line change
`@@ -186,6 +186,7 @@ def load_command_table(self, _):`
`186`	`186`	`g.custom_command(`
`187`	`187`	`"operation-abort", "aks_operation_abort", supports_no_wait=True`
`188`	`188`	`)`
	`189`	`+ g.custom_command("agent", "aks_agent")`
`189`	`190`
`190`	`191`	`# AKS maintenance configuration commands`
`191`	`192`	`with self.command_group(`