Skip to content

Commit a6ff392

Browse files
committed
add interactive AI-powered debugging tool az aks agent
1 parent 7fd5e48 commit a6ff392

5 files changed

Lines changed: 310 additions & 3 deletions

File tree

src/aks-preview/azext_aks_preview/_help.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3850,3 +3850,49 @@
38503850
- name: Show details of a load balancer configuration in table format
38513851
text: az aks loadbalancer show -g MyResourceGroup -n kubernetes --cluster-name MyManagedCluster -o table
38523852
"""
3853+
3854+
# Help entry for `az aks agent`. Parameter names must match the option names
# registered in _params.py (`no_interactive` -> `--no-interactive`,
# `no_echo_request` -> `--no-echo-request`), otherwise the summaries are not
# attached to the real arguments.
helps['aks agent'] = """
    type: command
    short-summary: Run AI assistant to analyze and troubleshoot Kubernetes clusters.
    parameters:
        - name: prompt
          type: string
          short-summary: Ask any question and get an answer using available tools.
        - name: --api-key
          type: string
          short-summary: API key to use for the LLM (if not given, uses environment variables AZURE_API_KEY, OPENAI_API_KEY).
        - name: --model
          type: string
          short-summary: Model to use for the LLM. For example, azure/<deployment_name> for Azure OpenAI, or <model_name> for OpenAI.
        - name: --config-file
          type: string
          short-summary: Path to configuration file.
        - name: --max-steps
          type: int
          short-summary: Maximum number of steps the LLM can take to investigate the issue.
        - name: --no-interactive
          type: bool
          short-summary: Disable interactive mode. When set, the agent will not prompt for input and will run in batch mode.
        - name: --no-echo-request
          type: bool
          short-summary: Disable echoing back the question provided to AKS Agent in the output.
        - name: --show-tool-output
          type: bool
          short-summary: Show the output of each tool that was called during the analysis.
        - name: --refresh-toolsets
          type: bool
          short-summary: Refresh the toolsets status.
    examples:
        - name: Ask about pod issues in the cluster
          text: az aks agent "Why are my pods not starting?" -g MyResourceGroup -n MyManagedCluster
        - name: Analyze cluster with specific AI model
          text: az aks agent "Check cluster health" -g MyResourceGroup -n MyManagedCluster --model gpt-4o --api-key sk-xxx
        - name: Run in non-interactive batch mode
          text: az aks agent "Diagnose networking issues" -g MyResourceGroup -n MyManagedCluster --no-interactive --max-steps 5
        - name: Use Azure OpenAI with custom deployment
          text: az aks agent "Analyze failed deployments" -g MyResourceGroup -n MyManagedCluster --model azure/my-gpt4-deployment --api-key xxx
        - name: Show detailed tool output during analysis
          text: az aks agent "Why is my service unavailable?" -g MyResourceGroup -n MyManagedCluster --show-tool-output
        - name: Use custom configuration file
          text: az aks agent "Check resource usage" -g MyResourceGroup -n MyManagedCluster --config-file /path/to/custom.config
"""

src/aks-preview/azext_aks_preview/_params.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
validate_nat_gateway_idle_timeout,
2424
validate_nat_gateway_managed_outbound_ip_count,
2525
)
26+
from azure.cli.core.api import get_config_dir
2627
from azure.cli.core.commands.parameters import (
2728
edge_zone_type,
2829
file_type,
@@ -2744,6 +2745,58 @@ def load_arguments(self, _):
27442745
help="Name of the load balancer configuration. Required.",
27452746
)
27462747

2748+
with self.argument_context("aks agent") as c:
2749+
c.positional("prompt", help="Ask any question and answer using available tools.")
2750+
c.argument(
2751+
"api_key",
2752+
default=None,
2753+
required=False,
2754+
help="API key to use for the LLM (if not given, uses environment variables AZURE_API_KEY, OPENAI_API_KEY).",
2755+
)
2756+
c.argument(
2757+
"model",
2758+
default=None,
2759+
required=False,
2760+
help="Model to use for the LLM. For example, azure/<deployment_name> for Azure OpenAI, or <model_name> for OpenAI.",
2761+
)
2762+
c.argument(
2763+
"max_steps",
2764+
type=int,
2765+
default=10,
2766+
required=False,
2767+
help="Maximum number of steps the LLM can take to investigate the issue.",
2768+
)
2769+
c.argument(
2770+
"config_file",
2771+
type=str,
2772+
default=os.path.join(get_config_dir(), "aksAgent.config"),
2773+
required=False,
2774+
help="Path to the config file.",
2775+
)
2776+
c.argument(
2777+
"no_interactive",
2778+
help="Disable interactive mode. When set, the agent will not prompt for input and will run in batch mode.",
2779+
required=False,
2780+
action='store_true',
2781+
)
2782+
c.argument(
2783+
"no_echo_request",
2784+
help="Disable echoing back the question provided to AKS Agent in the output.",
2785+
required=False,
2786+
action='store_true',
2787+
)
2788+
c.argument(
2789+
"show_tool_output",
2790+
help="Show the output of each tool that was called.",
2791+
required=False,
2792+
action='store_true',
2793+
)
2794+
c.argument(
2795+
"refresh_toolsets",
2796+
help="Refresh the toolsets status.",
2797+
required=False,
2798+
action='store_true',
2799+
)
27472800

27482801
def _get_default_install_location(exe_name):
27492802
system = platform.system()

src/aks-preview/azext_aks_preview/commands.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,7 @@ def load_command_table(self, _):
186186
g.custom_command(
187187
"operation-abort", "aks_operation_abort", supports_no_wait=True
188188
)
189+
g.custom_command("agent", "aks_agent")
189190

190191
# AKS maintenance configuration commands
191192
with self.command_group(

src/aks-preview/azext_aks_preview/custom.py

Lines changed: 206 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,17 @@
66
# pylint: disable=too-many-lines, disable=broad-except
77
import datetime
88
import json
9+
import logging
910
import os
1011
import os.path
12+
from pathlib import Path
1113
import platform
12-
import ssl
14+
import socket
1315
import sys
1416
import threading
1517
import time
18+
import typer
19+
import uuid
1620
import webbrowser
1721

1822
from azext_aks_preview._client_factory import (
@@ -4333,3 +4337,204 @@ def aks_loadbalancer_rebalance_nodes(
43334337
}
43344338

43354339
return aks_loadbalancer_rebalance_internal(managed_clusters_client, parameters)
4340+
4341+
4342+
def aks_agent(
    cmd,
    client,
    resource_group_name,
    name,
    prompt,
    api_key,
    model,
    max_steps,
    config_file,
    no_interactive=False,
    no_echo_request=False,
    show_tool_output=False,
    refresh_toolsets=False,
):
    '''
    Interact with the AKS agent using a prompt or piped input.

    In interactive mode the function hands control to the holmes interactive
    loop and returns when it ends. Otherwise it sends a single request to the
    LLM and prints the result to the console.

    :param cmd: CLI command object; its ``cli_ctx`` is used to resolve the subscription ID.
    :param client: Not used by this function.
    :param resource_group_name: Resource group name, embedded in the AKS context given to the LLM.
    :type resource_group_name: str
    :param name: AKS cluster name, embedded in the AKS context given to the LLM.
    :type name: str
    :param prompt: The prompt to send to the agent.
    :type prompt: str
    :param api_key: API key for authentication.
    :type api_key: str
    :param model: Model to use for the LLM.
    :type model: str
    :param max_steps: Maximum number of steps to take.
    :type max_steps: int
    :param config_file: Path to the config file.
    :type config_file: str
    :param no_interactive: Disable interactive mode.
    :type no_interactive: bool
    :param no_echo_request: Disable echoing back the question provided to AKS Agent in the output.
    :type no_echo_request: bool
    :param show_tool_output: Whether to show tool output.
    :type show_tool_output: bool
    :param refresh_toolsets: Refresh the toolsets status.
    :type refresh_toolsets: bool
    :raises typer.BadParameter: when running non-interactively with neither a prompt nor piped input.
    '''

    # The CLI exposes negative flags (--no-interactive, --no-echo-request);
    # flip them into the positive form used by the rest of this function.
    interactive = not no_interactive
    echo = not no_echo_request

    # Holmes library allows the user to specify the agent name through environment variable
    # before loading the library.
    os.environ["AGENT_NAME"] = "AKS AGENT"
    # NOTE(mainred): we need to disable INFO logs from LiteLLM before LiteLLM library is loaded,
    # to avoid logging the debug logs from heading of LiteLLM.
    logging.getLogger("LiteLLM").setLevel(logging.WARNING)
    from holmes.config import Config
    from holmes.core.prompt import build_initial_ask_messages
    from holmes.interactive import run_interactive_loop
    from holmes.plugins.destinations import DestinationType
    from holmes.plugins.interfaces import Issue
    from holmes.plugins.prompts import load_and_render_prompt
    from holmes.utils.console.logging import init_logging
    from holmes.utils.console.result import handle_result

    # NOTE(mainred): holmes leverage the log handler RichHandler to provide colorful, readable and
    # well-formatted logs making the interactive mode more user-friendly.
    # And we removed existing log handlers to avoid duplicate logs.
    # Also make the console log consistent, we remove the telemetry and data logger to skip redundant logs.
    def init_log():
        for noisy_logger in (
            "telemetry.main",
            "telemetry.process",
            "telemetry.save",
            "telemetry.client",
            "az_command_data_logger",
        ):
            logging.getLogger(noisy_logger).setLevel(logging.WARNING)
        # TODO: make log verbose configurable, currently disabled by [].
        return init_logging([])

    console = init_log()

    # Detect and read piped input
    piped_data = None
    if not sys.stdin.isatty():
        piped_data = sys.stdin.read().strip()
        if interactive:
            console.print(
                "[bold yellow]Interactive mode disabled when reading piped input[/bold yellow]"
            )
            interactive = False

    # Fail fast: validate the request before loading the config and creating the LLM client.
    # NOTE(review): typer.BadParameter is kept for compatibility with the original behavior;
    # consider an azure.cli.core.azclierror type so the CLI renders it as a user error - confirm.
    if not prompt and not interactive and not piped_data:
        raise typer.BadParameter(
            "A prompt is required: pass the 'prompt' argument or pipe input via stdin "
            "(the prompt may only be omitted in interactive mode)."
        )

    config_file = Path(config_file)
    config = Config.load_from_file(
        config_file,
        api_key=api_key,
        model=model,
        max_steps=max_steps,
    )

    ai = config.create_console_toolcalling_llm(
        dal=None,
        refresh_toolsets=refresh_toolsets,
    )
    template_context = {
        "toolsets": ai.tool_executor.toolsets,
        "runbooks": config.get_runbook_catalog(),
    }

    # Handle piped data
    if piped_data:
        if prompt:
            # User provided both piped data and a prompt
            prompt = f"Here's some piped output:\n\n{piped_data}\n\n{prompt}"
        else:
            # Only piped data, no prompt - ask what to do with it
            prompt = f"Here's some piped output:\n\n{piped_data}\n\nWhat can you tell me about this output?"

    if echo and not interactive and prompt:
        console.print("[bold yellow]User:[/bold yellow] " + prompt)

    # Start from the generic built-in system prompt; the AKS-specific context is appended below.
    system_prompt = "builtin://generic_ask.jinja2"
    system_prompt_rendered = load_and_render_prompt(system_prompt, template_context)

    subscription_id = get_subscription_id(cmd.cli_ctx)

    aks_template_context = {
        "cluster_name": name,
        "resource_group": resource_group_name,
        "subscription_id": subscription_id,
    }

    aks_context_prompt = """
# Azure Kubernetes Service (AKS)

You are specifically working with Azure Kubernetes Service (AKS) clusters. All investigations and troubleshooting should be performed on the AKS cluster. When troubleshooting AKS, you should consider both Azure resources and Kubernetes resources.

The current provided AKS context is as follow:
cluster_name: {{cluster_name}}
resource_group: {{resource_group}}
subscription_id: {{subscription_id}}

## Prerequisites
### AKS cluster name is under the resource group and subscription specified

You should check if the AKS cluster {{cluster_name}} can be found under resource group {{resource_group}} and subscription {{subscription_id}}.
If not, you should prompt to the user to specify correct cluster name, resource group and subscription ID.

## AKS cluster is in the current kubeconfig context
If the current kubeconfig context is not set to the AKS cluster {{cluster_name}}, you should download the kubeconfig credential with the cluster name {{cluster_name}}, resource group name {{resource_group}} and subscription ID {{subscription_id}}.
If the current kubeconfig context is set to the AKS cluster {{cluster_name}}, you should proceed with the investigation and troubleshooting.
"""
    aks_context_prompt = load_and_render_prompt(aks_context_prompt, aks_template_context)
    system_prompt_rendered += aks_context_prompt

    # Variables not exposed to the user.
    # Adds a prompt for post processing.
    post_processing_prompt = None
    # File to append to prompt
    include_file = None
    # TODO: add refresh-toolset to refresh the toolset if it has changed
    if interactive:
        run_interactive_loop(
            ai,
            console,
            system_prompt_rendered,
            prompt,
            post_processing_prompt,
            include_file,
            show_tool_output=show_tool_output,
        )
        return

    messages = build_initial_ask_messages(
        console,
        system_prompt_rendered,
        prompt,
        include_file,
    )

    response = ai.call(messages)

    messages = response.messages  # type: ignore # Update messages with the full history

    issue = Issue(
        id=str(uuid.uuid4()),
        name=prompt,
        source_type="holmes-ask",
        raw={"prompt": prompt, "full_conversation": messages},
        source_instance_id=socket.gethostname(),
    )
    # NOTE(review): the meaning of the trailing positional False is defined by
    # holmes' handle_result signature - confirm against the pinned holmes version.
    handle_result(
        response,
        console,
        DestinationType.CLI,
        config,
        issue,
        show_tool_output,
        False,
    )
4540+

src/aks-preview/setup.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from codecs import open as open1
99

10-
from setuptools import setup, find_packages
10+
from setuptools import find_packages, setup
1111

1212
VERSION = "18.0.0b18"
1313

@@ -23,7 +23,9 @@
2323
"License :: OSI Approved :: MIT License",
2424
]
2525

26-
DEPENDENCIES = []
26+
# Runtime dependencies of the extension.
# holmesgpt is installed from a git branch until an official PyPI package is
# available. HTTPS is used instead of SSH so installation does not require a
# GitHub SSH key to be configured on the user's machine.
DEPENDENCIES = [
    "holmesgpt @ git+https://github.com/robusta-dev/holmesgpt@dont_hardcode_agent_name",
]
2729

2830
with open1("README.rst", "r", encoding="utf-8") as f:
2931
README = f.read()

0 commit comments

Comments
 (0)