Skip to content

Commit bcf0958

Browse files
committed
feat: add start/end parameters for historical queries on k8s_list_* tools
1 parent 92c2c9c commit bcf0958

38 files changed

Lines changed: 1552 additions & 165 deletions

README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,20 @@ You can also set the following variables to override the default configuration:
180180
- `SYSDIG_MCP_LISTENING_PORT`: The port for the server when it is deployed using remote protocols (`streamable-http`, `sse`). Defaults to: `8080`
181181
- `SYSDIG_MCP_LISTENING_HOST`: The host for the server when it is deployed using remote protocols (`streamable-http`, `sse`). Defaults to all interfaces (`:port`). Set to `127.0.0.1` for local-only access.
182182
- `SYSDIG_MCP_STATELESS`: Enable stateless mode for `streamable-http` transport, where each request is self-contained with no session tracking (useful for AWS Bedrock AgentCore). Defaults to: `false`.
183+
- `SYSDIG_MCP_MAX_INTERVAL`: Maximum historical window accepted by the `k8s_list_*` Monitor tools when `start`/`end` are supplied. Go duration string (e.g. `24h`, `168h`). Defaults to: `168h` (7 days).
184+
185+
### Historical range on Monitor tools
186+
187+
All Sysdig Monitor `k8s_list_*` tools accept optional `start` / `end` RFC3339 parameters
188+
(e.g. `2026-04-16T00:00:00Z`). When omitted, tools return the current snapshot (unchanged
189+
behaviour). When provided, the underlying PromQL is wrapped in the aggregation appropriate
190+
for each tool (`avg_over_time`, `max_over_time`, `min_over_time`, `increase`, etc.) and
191+
evaluated at `end`. The window cannot exceed `SYSDIG_MCP_MAX_INTERVAL`. See
192+
[`internal/infra/mcp/tools/README.md`](./internal/infra/mcp/tools/README.md) for the
193+
per-tool aggregation table.
194+
195+
The legacy `interval` parameter on `k8s_list_top_http_errors_in_pods` and
196+
`k8s_list_top_network_errors_in_pods` is deprecated; prefer `start`/`end`.
183197

184198
You can find your API token in the Sysdig UI under **Settings > Sysdig Secure API** (or **Sysdig Monitor API**). Make sure to copy the token as it will not be shown again.
185199

cmd/server/main.go

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -119,22 +119,22 @@ func setupHandler(sysdigClient sysdig.ExtendedClientWithResponsesInterface) *mcp
119119
tools.NewToolRunSysql(sysdigClient),
120120
tools.NewToolGenerateSysql(sysdigClient),
121121

122-
tools.NewK8sListClusters(sysdigClient),
123-
tools.NewK8sListNodes(sysdigClient),
124-
tools.NewK8sListCronjobs(sysdigClient),
125-
tools.NewK8sListWorkloads(sysdigClient),
126-
tools.NewK8sListPodContainers(sysdigClient),
127-
tools.NewK8sListTopUnavailablePods(sysdigClient),
128-
tools.NewK8sListTopRestartedPods(sysdigClient),
129-
tools.NewK8sListTopHttpErrorsInPods(sysdigClient),
130-
tools.NewK8sListTopNetworkErrorsInPods(sysdigClient),
131-
tools.NewK8sListCountPodsPerCluster(sysdigClient),
132-
tools.NewK8sListUnderutilizedPodsCPUQuota(sysdigClient),
133-
tools.NewK8sListTopCPUConsumedWorkload(sysdigClient),
134-
tools.NewK8sListTopCPUConsumedContainer(sysdigClient),
135-
tools.NewK8sListUnderutilizedPodsMemoryQuota(sysdigClient),
136-
tools.NewK8sListTopMemoryConsumedWorkload(sysdigClient),
137-
tools.NewK8sListTopMemoryConsumedContainer(sysdigClient),
122+
tools.NewK8sListClusters(sysdigClient, systemClock),
123+
tools.NewK8sListNodes(sysdigClient, systemClock),
124+
tools.NewK8sListCronjobs(sysdigClient, systemClock),
125+
tools.NewK8sListWorkloads(sysdigClient, systemClock),
126+
tools.NewK8sListPodContainers(sysdigClient, systemClock),
127+
tools.NewK8sListTopUnavailablePods(sysdigClient, systemClock),
128+
tools.NewK8sListTopRestartedPods(sysdigClient, systemClock),
129+
tools.NewK8sListTopHttpErrorsInPods(sysdigClient, systemClock),
130+
tools.NewK8sListTopNetworkErrorsInPods(sysdigClient, systemClock),
131+
tools.NewK8sListCountPodsPerCluster(sysdigClient, systemClock),
132+
tools.NewK8sListUnderutilizedPodsCPUQuota(sysdigClient, systemClock),
133+
tools.NewK8sListTopCPUConsumedWorkload(sysdigClient, systemClock),
134+
tools.NewK8sListTopCPUConsumedContainer(sysdigClient, systemClock),
135+
tools.NewK8sListUnderutilizedPodsMemoryQuota(sysdigClient, systemClock),
136+
tools.NewK8sListTopMemoryConsumedWorkload(sysdigClient, systemClock),
137+
tools.NewK8sListTopMemoryConsumedContainer(sysdigClient, systemClock),
138138
)
139139
return handler
140140
}

internal/infra/mcp/tools/README.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,41 @@ The handler filters tools dynamically based on the Sysdig user's permissions. Ea
3838
|---|---|---|---|---|
3939
| `generate_sysql` | `tool_generate_sysql.go` | Convert natural language to SysQL via Sysdig Sage. | `sage.exec` (does not work with Service Accounts) | "Create a SysQL to list S3 buckets." |
4040

41+
## Historical range (start / end)
42+
43+
All Sysdig Monitor `k8s_list_*` tools accept two optional parameters:
44+
45+
- `start` — RFC3339 timestamp, e.g. `2026-04-16T00:00:00Z`
46+
- `end` — RFC3339 timestamp, e.g. `2026-04-16T01:00:00Z`
47+
48+
When omitted, tools return an instant snapshot (current behaviour). When provided,
49+
the underlying PromQL is wrapped in the aggregation appropriate for each tool and
50+
evaluated at `end`:
51+
52+
| Tool group | Wrapping applied when windowed |
53+
|---|---|
54+
| CPU / memory usage, underutilized quota, pod count | `avg_over_time(metric[Ns])` |
55+
| Top restarted pods | `increase(kube_pod_container_status_restarts_total[Ns])` |
56+
| Top unavailable pods | `min_over_time(kube_workload_status_unavailable[Ns]) >= 1` (Sysdig-canonical pattern — requires continuous unavailability for the entire window) |
57+
| HTTP / network errors | `sum_over_time(metric[Ns]) / N` (rate per second) |
58+
| Inventory tools (clusters, nodes, workloads, pod_containers, cronjobs) | `max_over_time(metric[Ns]) > 0` (workloads with status=ready/desired/running drop the `> 0` guard) |
59+
60+
Validation rules (helper: `time_window.go`):
61+
62+
- `end` without `start` → error.
63+
- `start` without `end` → `end` defaults to now.
64+
- `end <= start` → error.
65+
- `end > now + 60s` → error (60 s grace for client clock skew).
66+
- `end - start > SYSDIG_MCP_MAX_INTERVAL` (default **168 h / 7 d**) → error.
67+
68+
Windowed queries carry a 60 s client-side PromQL `Timeout` to fail fast before the
69+
Sysdig edge proxy's own 80–90 s cut-off.
70+
71+
The `interval` parameter on `k8s_list_top_http_errors_in_pods` and
72+
`k8s_list_top_network_errors_in_pods` is deprecated; `start`/`end` take precedence
73+
when both are present. An explicit `interval` emits a deprecation warning to the
74+
server log.
75+
4176
# Adding a New Tool
4277

4378
1. **See other tools:** Check how other tools are implemented so you have context on what they should look like.
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
package tools_test
2+
3+
import (
4+
"time"
5+
6+
. "github.com/onsi/gomega"
7+
8+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig"
9+
)
10+
11+
// newWindowedQueryParams constructs the GetQueryV1Params value that a windowed tool
12+
// invocation is expected to produce: Query string, Time = end.Unix() via FromQueryTime1,
13+
// and a 60s Timeout. Panics via Gomega if building the QueryTime fails — this only runs
14+
// in tests so the panic is the shortest useful path.
15+
func newWindowedQueryParams(query string, end time.Time) sysdig.GetQueryV1Params {
16+
var qt sysdig.Time
17+
Expect(qt.FromQueryTime1(end.Unix())).To(Succeed())
18+
timeout := sysdig.Timeout("60s")
19+
return sysdig.GetQueryV1Params{
20+
Query: query,
21+
Time: &qt,
22+
Timeout: &timeout,
23+
}
24+
}
25+
26+
// mergeLimit attaches a Limit field to an existing GetQueryV1Params value.
27+
// Used by tools that set params.Limit (memory_*, count_pods, underutilized_*).
28+
func mergeLimit(p sysdig.GetQueryV1Params, limit int) sysdig.GetQueryV1Params {
29+
lq := sysdig.LimitQuery(limit)
30+
p.Limit = &lq
31+
return p
32+
}
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
package tools
2+
3+
import (
4+
"fmt"
5+
"os"
6+
"time"
7+
8+
"github.com/mark3labs/mcp-go/mcp"
9+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/clock"
10+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig"
11+
)
12+
13+
// Limits and timeouts applied to windowed (historical) queries.
const (
	// maxIntervalEnvVar names the environment variable that overrides the maximum window.
	maxIntervalEnvVar = "SYSDIG_MCP_MAX_INTERVAL"
	// defaultMaxInterval is the maximum window when the override is unset or unusable (7 days).
	defaultMaxInterval = 7 * 24 * time.Hour
	// futureClockSkewGrace is how far past "now" an end timestamp may lie.
	futureClockSkewGrace = time.Minute
	// windowedQueryTimeout is the timeout string attached to windowed queries.
	windowedQueryTimeout = "60s"
)

// Names and descriptions of the shared time-window tool parameters.
const (
	timeParamStart = "start"
	timeParamEnd   = "end"

	startParamDescription = "Start of the query window as an RFC3339 timestamp (e.g. 2026-04-01T00:00:00Z). When omitted, the tool returns an instant snapshot (current behavior). When provided without end, end defaults to now."
	endParamDescription   = "End of the query window as an RFC3339 timestamp (e.g. 2026-04-01T01:00:00Z). Requires start. Must not be more than 60s in the future."
)
23+
24+
// TimeWindow is a resolved, validated [Start, End] pair for a historical PromQL query.
// A zero-value TimeWindow means no window was requested — the caller should emit its
// existing instant query and leave GetQueryV1Params.Time nil.
type TimeWindow struct {
	// Start is the inclusive beginning of the window.
	Start time.Time
	// End is the end of the window and the instant at which the query is evaluated.
	End time.Time
}

// IsZero reports whether no time window was requested, i.e. both Start and End
// are the zero time.
func (w TimeWindow) IsZero() bool {
	return w.Start.IsZero() && w.End.IsZero()
}

// RangeSelector returns the PromQL range-selector literal for this window, e.g. "[3600s]".
// The duration is rounded down to whole seconds so the selector is stable and debuggable.
func (w TimeWindow) RangeSelector() string {
	// Integer division truncates exactly. Going through Duration.Seconds() (float64)
	// can round *up* for long windows with a near-full fractional second, which
	// would contradict the round-down contract.
	wholeSeconds := int64(w.End.Sub(w.Start) / time.Second)
	return fmt.Sprintf("[%ds]", wholeSeconds)
}
42+
43+
// EvalTime returns a *sysdig.Time suitable for GetQueryV1Params.Time. The value is
44+
// the End instant as unix seconds — the native format accepted by Sysdig's internal
45+
// PromQL stack (confirmed against backend PrometheusFacadeController.java:113).
46+
func (w TimeWindow) EvalTime() (*sysdig.Time, error) {
47+
if w.IsZero() {
48+
return nil, nil
49+
}
50+
var qt sysdig.Time
51+
if err := qt.FromQueryTime1(w.End.Unix()); err != nil {
52+
return nil, fmt.Errorf("building eval time: %w", err)
53+
}
54+
return &qt, nil
55+
}
56+
57+
// WithTimeWindowParams returns a ToolOption that declares the shared "start" and "end"
58+
// RFC3339 parameters on a tool.
59+
func WithTimeWindowParams() mcp.ToolOption {
60+
return func(t *mcp.Tool) {
61+
mcp.WithString(timeParamStart, mcp.Description(startParamDescription))(t)
62+
mcp.WithString(timeParamEnd, mcp.Description(endParamDescription))(t)
63+
}
64+
}
65+
66+
// ParseTimeWindow reads "start" and "end" from the request, validates them, and returns
67+
// the resolved TimeWindow.
68+
//
69+
// - Both absent: returns zero-value TimeWindow, nil error.
70+
// - end without start: error ("end requires start").
71+
// - start without end: end = clk.Now().
72+
// - invalid RFC3339: error naming the bad field.
73+
// - end <= start: error.
74+
// - end > clk.Now() + 60s: error (generous grace for client clock skew).
75+
// - end - start > maxInterval(): error referencing SYSDIG_MCP_MAX_INTERVAL.
76+
func ParseTimeWindow(request mcp.CallToolRequest, clk clock.Clock) (TimeWindow, error) {
77+
startStr := mcp.ParseString(request, timeParamStart, "")
78+
endStr := mcp.ParseString(request, timeParamEnd, "")
79+
80+
if startStr == "" && endStr == "" {
81+
return TimeWindow{}, nil
82+
}
83+
84+
if startStr == "" && endStr != "" {
85+
return TimeWindow{}, fmt.Errorf("end requires start")
86+
}
87+
88+
start, err := time.Parse(time.RFC3339, startStr)
89+
if err != nil {
90+
return TimeWindow{}, fmt.Errorf("invalid start timestamp %q: must be RFC3339 (e.g. 2026-04-01T00:00:00Z)", startStr)
91+
}
92+
93+
now := clk.Now()
94+
95+
var end time.Time
96+
if endStr == "" {
97+
end = now
98+
} else {
99+
end, err = time.Parse(time.RFC3339, endStr)
100+
if err != nil {
101+
return TimeWindow{}, fmt.Errorf("invalid end timestamp %q: must be RFC3339 (e.g. 2026-04-01T01:00:00Z)", endStr)
102+
}
103+
}
104+
105+
if !end.After(start) {
106+
return TimeWindow{}, fmt.Errorf("end (%s) must be after start (%s)", end.Format(time.RFC3339), start.Format(time.RFC3339))
107+
}
108+
109+
if end.After(now.Add(futureClockSkewGrace)) {
110+
return TimeWindow{}, fmt.Errorf("end (%s) must not be more than %s in the future (server time: %s)", end.Format(time.RFC3339), futureClockSkewGrace, now.Format(time.RFC3339))
111+
}
112+
113+
window := end.Sub(start)
114+
if max := maxInterval(); window > max {
115+
return TimeWindow{}, fmt.Errorf("requested window (%s) exceeds maximum allowed (%s); set %s to raise the cap", window, max, maxIntervalEnvVar)
116+
}
117+
118+
return TimeWindow{Start: start, End: end}, nil
119+
}
120+
121+
// maxInterval returns the configured maximum window duration. The SYSDIG_MCP_MAX_INTERVAL
122+
// env var is read on every call (not cached) so tests can override it via t.Setenv without
123+
// bumping into sync.Once-style staleness.
124+
func maxInterval() time.Duration {
125+
if v := os.Getenv(maxIntervalEnvVar); v != "" {
126+
if d, err := time.ParseDuration(v); err == nil && d > 0 {
127+
return d
128+
}
129+
}
130+
return defaultMaxInterval
131+
}

0 commit comments

Comments
 (0)