Skip to content

Commit d7aec54

Browse files
authored
feat(k8s): support promql historical data queries (#82)
Enable users the ability to query historical data (if provided a timeline) from metric tools that use Sysdig Monitor with no data restrictions. - Adds optional ``start/end`` historical query parameters to all Sysdig Monitor ``k8s_list_* tools``, enabling LLMs to query Kubernetes metrics over a past time window instead of only the current snapshot. - When provided, the underlying PromQL is wrapped in the appropriate aggregation for each tool: - CPU/memory/pod count → ``avg_over_time`` - Restarted pods → ``increase`` - Unavailable pods → ``min_over_time >= 1`` - HTTP/network errors → ``sum_over_time / N (rate per second)`` - Inventory tools (clusters, nodes, workloads, etc.) → ``max_over_time > 0`` - When omitted, tools behave as before (instant snapshot)
1 parent 92c2c9c commit d7aec54

40 files changed

Lines changed: 1378 additions & 170 deletions

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,8 @@ The server dynamically filters the available tools based on the permissions asso
131131
- **Required Permission**: `metrics-data.read`
132132
- **Sample Prompt**: "Show the top 10 underutilized pods by memory quota in cluster 'production'"
133133

134+
> **Note:** When a time window is provided, the underlying PromQL is wrapped in the aggregation appropriate for each tool (`avg_over_time`, `max_over_time`, `min_over_time`, `increase`, etc.) and evaluated at `end`. See [`internal/infra/mcp/tools/README.md`](./internal/infra/mcp/tools/README.md) for the per-tool aggregation table.
135+
134136
### Sysdig Secure
135137

136138
- **`list_runtime_events`**

cmd/server/main.go

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -119,22 +119,22 @@ func setupHandler(sysdigClient sysdig.ExtendedClientWithResponsesInterface) *mcp
119119
tools.NewToolRunSysql(sysdigClient),
120120
tools.NewToolGenerateSysql(sysdigClient),
121121

122-
tools.NewK8sListClusters(sysdigClient),
123-
tools.NewK8sListNodes(sysdigClient),
124-
tools.NewK8sListCronjobs(sysdigClient),
125-
tools.NewK8sListWorkloads(sysdigClient),
126-
tools.NewK8sListPodContainers(sysdigClient),
127-
tools.NewK8sListTopUnavailablePods(sysdigClient),
128-
tools.NewK8sListTopRestartedPods(sysdigClient),
129-
tools.NewK8sListTopHttpErrorsInPods(sysdigClient),
130-
tools.NewK8sListTopNetworkErrorsInPods(sysdigClient),
131-
tools.NewK8sListCountPodsPerCluster(sysdigClient),
132-
tools.NewK8sListUnderutilizedPodsCPUQuota(sysdigClient),
133-
tools.NewK8sListTopCPUConsumedWorkload(sysdigClient),
134-
tools.NewK8sListTopCPUConsumedContainer(sysdigClient),
135-
tools.NewK8sListUnderutilizedPodsMemoryQuota(sysdigClient),
136-
tools.NewK8sListTopMemoryConsumedWorkload(sysdigClient),
137-
tools.NewK8sListTopMemoryConsumedContainer(sysdigClient),
122+
tools.NewK8sListClusters(sysdigClient, systemClock),
123+
tools.NewK8sListNodes(sysdigClient, systemClock),
124+
tools.NewK8sListCronjobs(sysdigClient, systemClock),
125+
tools.NewK8sListWorkloads(sysdigClient, systemClock),
126+
tools.NewK8sListPodContainers(sysdigClient, systemClock),
127+
tools.NewK8sListTopUnavailablePods(sysdigClient, systemClock),
128+
tools.NewK8sListTopRestartedPods(sysdigClient, systemClock),
129+
tools.NewK8sListTopHttpErrorsInPods(sysdigClient, systemClock),
130+
tools.NewK8sListTopNetworkErrorsInPods(sysdigClient, systemClock),
131+
tools.NewK8sListCountPodsPerCluster(sysdigClient, systemClock),
132+
tools.NewK8sListUnderutilizedPodsCPUQuota(sysdigClient, systemClock),
133+
tools.NewK8sListTopCPUConsumedWorkload(sysdigClient, systemClock),
134+
tools.NewK8sListTopCPUConsumedContainer(sysdigClient, systemClock),
135+
tools.NewK8sListUnderutilizedPodsMemoryQuota(sysdigClient, systemClock),
136+
tools.NewK8sListTopMemoryConsumedWorkload(sysdigClient, systemClock),
137+
tools.NewK8sListTopMemoryConsumedContainer(sysdigClient, systemClock),
138138
)
139139
return handler
140140
}

internal/infra/mcp/tools/README.md

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,35 @@ The handler filters tools dynamically based on the Sysdig user's permissions. Ea
3838
|---|---|---|---|---|
3939
| `generate_sysql` | `tool_generate_sysql.go` | Convert natural language to SysQL via Sysdig Sage. | `sage.exec` (does not work with Service Accounts) | "Create a SysQL to list S3 buckets." |
4040

41+
## Historical range (start / end)
42+
43+
All Sysdig Monitor `k8s_list_*` tools accept two optional parameters:
44+
45+
- `start` — RFC3339 timestamp, e.g. `2026-04-16T00:00:00Z`
46+
- `end` — RFC3339 timestamp, e.g. `2026-04-16T01:00:00Z`
47+
48+
When omitted, tools return an instant snapshot (current behaviour). When provided,
49+
the underlying PromQL is wrapped in the aggregation appropriate for each tool and
50+
evaluated at `end`:
51+
52+
| Tool group | Wrapping applied when windowed |
53+
|---|---|
54+
| CPU / memory usage, underutilized quota, pod count | `avg_over_time(metric[Ns])` |
55+
| Top restarted pods | `increase(kube_pod_container_status_restarts_total[Ns])` |
56+
| Top unavailable pods | `min_over_time(kube_workload_status_unavailable[Ns]) >= 1` (Sysdig-canonical pattern — requires continuous unavailability for the entire window) |
57+
| HTTP / network errors | `sum_over_time(metric[Ns]) / N` (rate per second) |
58+
| Inventory tools (clusters, nodes, workloads, pod_containers, cronjobs) | `max_over_time(metric[Ns]) > 0` (workloads with status=ready/desired/running drop the `> 0` guard) |
59+
60+
Validation rules (helper: `utils.go`):
61+
62+
- `end` without `start` → error.
63+
- `start` without `end``end` defaults to now.
64+
- `end` in the future → clamped to now.
65+
- `end <= start` → error.
66+
67+
Windowed queries carry a 60 s client-side PromQL `Timeout` to fail fast before the
68+
Sysdig edge proxy's own 80–90 s cut-off.
69+
4170
# Adding a New Tool
4271

4372
1. **See other tools:** Check how other tools are implemented so you can have the context on how they should look like.

internal/infra/mcp/tools/tool_k8s_list_clusters.go

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,27 +8,31 @@ import (
88

99
"github.com/mark3labs/mcp-go/mcp"
1010
"github.com/mark3labs/mcp-go/server"
11+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/clock"
1112
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig"
1213
)
1314

1415
type K8sListClusters struct {
1516
SysdigClient sysdig.ExtendedClientWithResponsesInterface
17+
clock clock.Clock
1618
}
1719

18-
func NewK8sListClusters(sysdigClient sysdig.ExtendedClientWithResponsesInterface) *K8sListClusters {
20+
func NewK8sListClusters(sysdigClient sysdig.ExtendedClientWithResponsesInterface, clk clock.Clock) *K8sListClusters {
1921
return &K8sListClusters{
2022
SysdigClient: sysdigClient,
23+
clock: clk,
2124
}
2225
}
2326

2427
func (t *K8sListClusters) RegisterInServer(s *server.MCPServer) {
2528
tool := mcp.NewTool("k8s_list_clusters",
26-
mcp.WithDescription("Lists the cluster information for all clusters or just the cluster specified."),
29+
mcp.WithDescription("Lists the cluster information for all clusters or just the cluster specified. Optionally pass start/end (RFC3339) to list clusters that existed at any point in the window."),
2730
mcp.WithString("cluster_name", mcp.Description("The name of the cluster to filter by.")),
2831
mcp.WithNumber("limit",
2932
mcp.Description("Maximum number of clusters to return."),
3033
mcp.DefaultNumber(10),
3134
),
35+
WithTimeWindowParams(),
3236
mcp.WithOutputSchema[map[string]any](),
3337
mcp.WithReadOnlyHintAnnotation(true),
3438
mcp.WithDestructiveHintAnnotation(false),
@@ -41,16 +45,21 @@ func (t *K8sListClusters) handle(ctx context.Context, request mcp.CallToolReques
4145
clusterName := mcp.ParseString(request, "cluster_name", "")
4246
limit := mcp.ParseInt(request, "limit", 10)
4347

44-
query := "kube_cluster_info"
45-
if clusterName != "" {
46-
query = fmt.Sprintf("kube_cluster_info{cluster=\"%s\"}", clusterName)
48+
tw, err := ParseTimeWindow(request, t.clock)
49+
if err != nil {
50+
return mcp.NewToolResultErrorFromErr("invalid time window", err), nil
4751
}
4852

53+
query := buildKubeClusterInfoQuery(clusterName, tw)
54+
4955
limitQuery := sysdig.LimitQuery(limit)
5056
params := &sysdig.GetQueryV1Params{
5157
Query: query,
5258
Limit: &limitQuery,
5359
}
60+
if err := tw.ApplyToParams(params); err != nil {
61+
return mcp.NewToolResultErrorFromErr("failed to build eval time", err), nil
62+
}
5463

5564
httpResp, err := t.SysdigClient.GetQueryV1(ctx, params)
5665
if err != nil {
@@ -69,3 +78,14 @@ func (t *K8sListClusters) handle(ctx context.Context, request mcp.CallToolReques
6978

7079
return mcp.NewToolResultJSON(queryResponse)
7180
}
81+
82+
func buildKubeClusterInfoQuery(clusterName string, tw TimeWindow) string {
83+
metric := "kube_cluster_info"
84+
if clusterName != "" {
85+
metric = fmt.Sprintf(`kube_cluster_info{cluster="%s"}`, clusterName)
86+
}
87+
if !tw.IsZero() {
88+
return fmt.Sprintf("max_over_time(%s%s) > 0", metric, tw.RangeSelector())
89+
}
90+
return metric
91+
}

internal/infra/mcp/tools/tool_k8s_list_clusters_test.go

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,29 +5,35 @@ import (
55
"context"
66
"io"
77
"net/http"
8+
"time"
89

910
"github.com/mark3labs/mcp-go/mcp"
1011
"github.com/mark3labs/mcp-go/server"
1112
. "github.com/onsi/ginkgo/v2"
1213
. "github.com/onsi/gomega"
14+
"go.uber.org/mock/gomock"
15+
16+
mocks_clock "github.com/sysdiglabs/sysdig-mcp-server/internal/infra/clock/mocks"
1317
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/mcp/tools"
1418
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig"
1519
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig/mocks"
16-
"go.uber.org/mock/gomock"
1720
)
1821

1922
var _ = Describe("KubernetesListClusters Tool", func() {
2023
var (
2124
tool *tools.K8sListClusters
2225
mockSysdig *mocks.MockExtendedClientWithResponsesInterface
26+
mockClock *mocks_clock.MockClock
2327
mcpServer *server.MCPServer
2428
ctrl *gomock.Controller
2529
)
2630

2731
BeforeEach(func() {
2832
ctrl = gomock.NewController(GinkgoT())
2933
mockSysdig = mocks.NewMockExtendedClientWithResponsesInterface(ctrl)
30-
tool = tools.NewK8sListClusters(mockSysdig)
34+
mockClock = mocks_clock.NewMockClock(ctrl)
35+
mockClock.EXPECT().Now().AnyTimes().Return(time.Date(2026, time.April, 16, 12, 0, 0, 0, time.UTC))
36+
tool = tools.NewK8sListClusters(mockSysdig, mockClock)
3137
mcpServer = server.NewMCPServer("test", "test")
3238
tool.RegisterInServer(mcpServer)
3339
})
@@ -103,6 +109,39 @@ var _ = Describe("KubernetesListClusters Tool", func() {
103109
Limit: new(sysdig.LimitQuery(20)),
104110
},
105111
),
112+
Entry("windowed, no filters",
113+
"k8s_list_clusters",
114+
mcp.CallToolRequest{
115+
Params: mcp.CallToolParams{
116+
Name: "k8s_list_clusters",
117+
Arguments: map[string]any{
118+
"start": "2026-04-16T10:00:00Z",
119+
"end": "2026-04-16T11:00:00Z",
120+
},
121+
},
122+
},
123+
mergeLimit(newWindowedQueryParams(
124+
`max_over_time(kube_cluster_info[3600s]) > 0`,
125+
time.Date(2026, time.April, 16, 11, 0, 0, 0, time.UTC),
126+
), 10),
127+
),
128+
Entry("windowed, cluster_name filter",
129+
"k8s_list_clusters",
130+
mcp.CallToolRequest{
131+
Params: mcp.CallToolParams{
132+
Name: "k8s_list_clusters",
133+
Arguments: map[string]any{
134+
"cluster_name": "my_cluster",
135+
"start": "2026-04-16T10:00:00Z",
136+
"end": "2026-04-16T11:00:00Z",
137+
},
138+
},
139+
},
140+
mergeLimit(newWindowedQueryParams(
141+
`max_over_time(kube_cluster_info{cluster="my_cluster"}[3600s]) > 0`,
142+
time.Date(2026, time.April, 16, 11, 0, 0, 0, time.UTC),
143+
), 10),
144+
),
106145
)
107146
})
108147
})

internal/infra/mcp/tools/tool_k8s_list_count_pods_per_cluster.go

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,28 +9,32 @@ import (
99

1010
"github.com/mark3labs/mcp-go/mcp"
1111
"github.com/mark3labs/mcp-go/server"
12+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/clock"
1213
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig"
1314
)
1415

1516
type K8sListCountPodsPerCluster struct {
1617
SysdigClient sysdig.ExtendedClientWithResponsesInterface
18+
clock clock.Clock
1719
}
1820

19-
func NewK8sListCountPodsPerCluster(sysdigClient sysdig.ExtendedClientWithResponsesInterface) *K8sListCountPodsPerCluster {
21+
func NewK8sListCountPodsPerCluster(sysdigClient sysdig.ExtendedClientWithResponsesInterface, clk clock.Clock) *K8sListCountPodsPerCluster {
2022
return &K8sListCountPodsPerCluster{
2123
SysdigClient: sysdigClient,
24+
clock: clk,
2225
}
2326
}
2427

2528
func (t *K8sListCountPodsPerCluster) RegisterInServer(s *server.MCPServer) {
2629
tool := mcp.NewTool("k8s_list_count_pods_per_cluster",
27-
mcp.WithDescription("List the count of running Kubernetes Pods grouped by cluster and namespace."),
30+
mcp.WithDescription("List the count of running Kubernetes Pods grouped by cluster and namespace. Optionally pass start/end (RFC3339) to count pods averaged over a historical window instead of the current instant snapshot."),
2831
mcp.WithString("cluster_name", mcp.Description("The name of the cluster to filter by.")),
2932
mcp.WithString("namespace_name", mcp.Description("The name of the namespace to filter by.")),
3033
mcp.WithNumber("limit",
3134
mcp.Description("Maximum number of results to return."),
3235
mcp.DefaultNumber(20),
3336
),
37+
WithTimeWindowParams(),
3438
mcp.WithOutputSchema[map[string]any](),
3539
mcp.WithReadOnlyHintAnnotation(true),
3640
mcp.WithDestructiveHintAnnotation(false),
@@ -44,13 +48,21 @@ func (t *K8sListCountPodsPerCluster) handle(ctx context.Context, request mcp.Cal
4448
namespaceName := mcp.ParseString(request, "namespace_name", "")
4549
limit := mcp.ParseInt(request, "limit", 20)
4650

47-
query := buildKubePodCountQuery(clusterName, namespaceName)
51+
tw, err := ParseTimeWindow(request, t.clock)
52+
if err != nil {
53+
return mcp.NewToolResultErrorFromErr("invalid time window", err), nil
54+
}
55+
56+
query := buildKubePodCountQuery(clusterName, namespaceName, tw)
4857

4958
limitQuery := sysdig.LimitQuery(limit)
5059
params := &sysdig.GetQueryV1Params{
5160
Query: query,
5261
Limit: &limitQuery,
5362
}
63+
if err := tw.ApplyToParams(params); err != nil {
64+
return mcp.NewToolResultErrorFromErr("failed to build eval time", err), nil
65+
}
5466

5567
httpResp, err := t.SysdigClient.GetQueryV1(ctx, params)
5668
if err != nil {
@@ -70,7 +82,7 @@ func (t *K8sListCountPodsPerCluster) handle(ctx context.Context, request mcp.Cal
7082
return mcp.NewToolResultJSON(queryResponse)
7183
}
7284

73-
func buildKubePodCountQuery(clusterName, namespaceName string) string {
85+
func buildKubePodCountQuery(clusterName, namespaceName string, tw TimeWindow) string {
7486
filters := []string{}
7587
if clusterName != "" {
7688
filters = append(filters, fmt.Sprintf("kube_cluster_name=\"%s\"", clusterName))
@@ -84,5 +96,10 @@ func buildKubePodCountQuery(clusterName, namespaceName string) string {
8496
filterString = fmt.Sprintf("{%s}", strings.Join(filters, ","))
8597
}
8698

87-
return fmt.Sprintf("sum by (kube_cluster_name, kube_namespace_name) (kube_pod_info%s)", filterString)
99+
metric := fmt.Sprintf("kube_pod_info%s", filterString)
100+
if !tw.IsZero() {
101+
metric = fmt.Sprintf("avg_over_time(%s%s)", metric, tw.RangeSelector())
102+
}
103+
104+
return fmt.Sprintf("sum by (kube_cluster_name, kube_namespace_name) (%s)", metric)
88105
}

internal/infra/mcp/tools/tool_k8s_list_count_pods_per_cluster_test.go

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,29 +5,35 @@ import (
55
"context"
66
"io"
77
"net/http"
8+
"time"
89

910
"github.com/mark3labs/mcp-go/mcp"
1011
"github.com/mark3labs/mcp-go/server"
1112
. "github.com/onsi/ginkgo/v2"
1213
. "github.com/onsi/gomega"
14+
"go.uber.org/mock/gomock"
15+
16+
mocks_clock "github.com/sysdiglabs/sysdig-mcp-server/internal/infra/clock/mocks"
1317
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/mcp/tools"
1418
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig"
1519
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig/mocks"
16-
"go.uber.org/mock/gomock"
1720
)
1821

1922
var _ = Describe("KubernetesListCountPodsPerCluster Tool", func() {
2023
var (
2124
tool *tools.K8sListCountPodsPerCluster
2225
mockSysdig *mocks.MockExtendedClientWithResponsesInterface
26+
mockClock *mocks_clock.MockClock
2327
mcpServer *server.MCPServer
2428
ctrl *gomock.Controller
2529
)
2630

2731
BeforeEach(func() {
2832
ctrl = gomock.NewController(GinkgoT())
2933
mockSysdig = mocks.NewMockExtendedClientWithResponsesInterface(ctrl)
30-
tool = tools.NewK8sListCountPodsPerCluster(mockSysdig)
34+
mockClock = mocks_clock.NewMockClock(ctrl)
35+
mockClock.EXPECT().Now().AnyTimes().Return(time.Date(2026, time.April, 16, 12, 0, 0, 0, time.UTC))
36+
tool = tools.NewK8sListCountPodsPerCluster(mockSysdig, mockClock)
3137
mcpServer = server.NewMCPServer("test", "test")
3238
tool.RegisterInServer(mcpServer)
3339
})
@@ -116,6 +122,22 @@ var _ = Describe("KubernetesListCountPodsPerCluster Tool", func() {
116122
Limit: new(sysdig.LimitQuery(20)),
117123
},
118124
),
125+
Entry("windowed, both start and end",
126+
"k8s_list_count_pods_per_cluster",
127+
mcp.CallToolRequest{
128+
Params: mcp.CallToolParams{
129+
Name: "k8s_list_count_pods_per_cluster",
130+
Arguments: map[string]any{
131+
"start": "2026-04-16T10:00:00Z",
132+
"end": "2026-04-16T11:00:00Z",
133+
},
134+
},
135+
},
136+
mergeLimit(newWindowedQueryParams(
137+
`sum by (kube_cluster_name, kube_namespace_name) (avg_over_time(kube_pod_info[3600s]))`,
138+
time.Date(2026, time.April, 16, 11, 0, 0, 0, time.UTC),
139+
), 20),
140+
),
119141
)
120142
})
121143
})

0 commit comments

Comments
 (0)