Skip to content

Commit 92b7ed5

Browse files
authored
fix(kubernetes): use pod stats for keepalive (#78)
1 parent 64cf9a7 commit 92b7ed5

20 files changed

Lines changed: 644 additions & 342 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ There is also websocket support for stdout. TTY is also supported.
6767

6868
Runtime selection is daemon-only: start the daemon with `druid daemon --runtime docker`, then use `druid` to create, run, and inspect scrolls without passing a runtime. Docker runtime state stays in SQLite under the runtime state directory. Scroll specs and runtime data live together in one runtime root.
6969

70-
Kubernetes runtime support is available with `druid daemon --runtime kubernetes` for in-cluster daemons or out-of-cluster daemons using kubeconfig. It stores daemon scroll state in ConfigMaps, materializes OCI artifacts through `druid worker pull` Jobs, and uses Cilium/Hubble Relay for port traffic presence. See `docs/kubernetes_runtime.md` for kubeconfig, RBAC, PVC, and Hubble setup.
70+
Kubernetes runtime support is available with `druid daemon --runtime kubernetes` for in-cluster daemons or out-of-cluster daemons using kubeconfig. It stores daemon scroll state in ConfigMaps, materializes OCI artifacts through `druid worker pull` Jobs, and uses kubelet pod stats for procedure-level traffic checks. See `docs/kubernetes_runtime.md` for kubeconfig, RBAC, and PVC setup.
7171

7272
## Documentation
7373

apps/druid/adapters/cli/daemon.go

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ var k8sUIS3Region string
3535
var k8sUIS3Endpoint string
3636
var k8sUIS3Prefix string
3737
var k8sUIS3Secret string
38-
var hubbleRelayAddr string
3938
var k8sKubeconfig string
4039
var runtimeListen string
4140
var runtimePublicListen string
@@ -98,7 +97,6 @@ func init() {
9897
DaemonCommand.Flags().StringVar(&k8sUIS3Prefix, "k8s-ui-s3-prefix", "", "Optional S3 key prefix for UI packages (default: DRUID_K8S_UI_S3_PREFIX)")
9998
DaemonCommand.Flags().StringVar(&k8sUIS3Secret, "k8s-ui-s3-credentials-secret", "", "Kubernetes secret with AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY (default: DRUID_K8S_UI_S3_CREDENTIALS_SECRET)")
10099
DaemonCommand.Flags().StringVar(&k8sKubeconfig, "k8s-kubeconfig", "", "Kubernetes kubeconfig path for out-of-cluster runtime access (default: DRUID_K8S_KUBECONFIG, KUBECONFIG, or ~/.kube/config)")
101-
DaemonCommand.Flags().StringVar(&hubbleRelayAddr, "hubble-relay-addr", "", "Hubble Relay gRPC address for Kubernetes port traffic (default: DRUID_HUBBLE_RELAY_ADDR or hubble-relay.kube-system.svc.cluster.local:80)")
102100
}
103101

104102
func runRuntimeDaemon() error {
@@ -111,7 +109,6 @@ func runRuntimeDaemon() error {
111109
StorageClass: k8sStorageClass,
112110
PullImage: k8sPullImage,
113111
RegistrySecret: k8sRegistrySecret,
114-
HubbleRelayAddr: hubbleRelayAddr,
115112
Kubeconfig: k8sKubeconfig,
116113
UIS3Bucket: k8sUIS3Bucket,
117114
UIS3PublicBaseURL: k8sUIS3PublicBaseURL,

config/helm-charts/druid-cli/chart_test.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ func TestChartRendersDefaultAndCustomValues(t *testing.T) {
3030
`resources: ["secrets"]`,
3131
`resources: ["pods/attach"]`,
3232
`verbs: ["create"]`,
33-
"hubble-relay.kube-system.svc.cluster.local:80",
33+
`resources: ["nodes/proxy"]`,
3434
} {
3535
if !strings.Contains(defaultManifest, want) {
3636
t.Fatalf("default manifest does not contain %q", want)
@@ -51,7 +51,6 @@ func TestChartRendersDefaultAndCustomValues(t *testing.T) {
5151
"--set", "runtime.pullImage=registry.local/druid-cli:e2e",
5252
"--set", "runtime.helperImage=busybox:1.36",
5353
"--set", "runtime.kubeconfigSecret.name=druid-kubeconfig",
54-
"--set", "hubble.relayAddr=hubble.example:80",
5554
"--set", "networkPolicy.enabled=true",
5655
"--set", "ingress.enabled=true",
5756
"--set", "ingress.hosts[0].host=runtime.example.test",
@@ -68,7 +67,6 @@ func TestChartRendersDefaultAndCustomValues(t *testing.T) {
6867
"value: \"busybox:1.36\"",
6968
"value: \"true\"",
7069
"value: /etc/druid/kubeconfig",
71-
"hubble.example:80",
7270
"kind: NetworkPolicy",
7371
"kind: Ingress",
7472
"runtime.example.test",
@@ -80,6 +78,10 @@ func TestChartRendersDefaultAndCustomValues(t *testing.T) {
8078
t.Fatalf("custom manifest does not contain %q", want)
8179
}
8280
}
81+
removedTrafficEnv := "DRUID_" + "HU" + "BBLE_RELAY_ADDR"
82+
if strings.Contains(defaultManifest, removedTrafficEnv) || strings.Contains(customManifest, removedTrafficEnv) {
83+
t.Fatal("chart rendered removed traffic environment")
84+
}
8385
}
8486

8587
func helmTemplate(t *testing.T, args ...string) string {

config/helm-charts/druid-cli/templates/deployment.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,6 @@ spec:
6868
value: {{ .Values.runtime.registrySecret | quote }}
6969
- name: DRUID_REGISTRY_PLAIN_HTTP
7070
value: {{ ternary "true" "false" .Values.runtime.registryPlainHTTP | quote }}
71-
- name: DRUID_HUBBLE_RELAY_ADDR
72-
value: {{ .Values.hubble.relayAddr | quote }}
7371
{{- if .Values.runtime.kubeconfigSecret.name }}
7472
- name: DRUID_K8S_KUBECONFIG
7573
value: /etc/druid/kubeconfig

config/helm-charts/druid-cli/templates/rbac.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ rules:
2121
- apiGroups: [""]
2222
resources: ["pods/attach"]
2323
verbs: ["create"]
24+
- apiGroups: [""]
25+
resources: ["nodes/proxy"]
26+
verbs: ["get"]
2427
- apiGroups: ["apps"]
2528
resources: ["statefulsets"]
2629
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]

config/helm-charts/druid-cli/values.yaml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,6 @@ runtime:
6363
name: ""
6464
key: kubeconfig
6565

66-
hubble:
67-
relayAddr: hubble-relay.kube-system.svc.cluster.local:80
68-
6966
auth:
7067
enabled: false
7168
jwksUrl: ""

docs_md/kubernetes_keepalive.md

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,26 +5,27 @@ sidebar_label: Kubernetes keepAliveTraffic
55

66
## Kubernetes keepAliveTraffic
77

8-
Kubernetes runtimes use Hubble Relay to evaluate `keepAliveTraffic` on expected ports.
8+
Kubernetes runtimes use kubelet pod network stats to evaluate `keepAliveTraffic` on running procedures.
99

10-
When a running job procedure has an expected port with `keepAliveTraffic`, druid checks for matching Hubble flows over the configured window. If the full window has elapsed and no flow is observed, druid deletes that procedure job and records it as a clean stop. The command run mode is not changed; `restart` and `persistent` scheduling decide what runs next.
10+
When a running job procedure has an expected port with `keepAliveTraffic`, druid samples that procedure pod's RX/TX bytes from `/api/v1/nodes/<node>/proxy/stats/summary`. If the full configured window has elapsed and the RX-byte delta is below every configured threshold, druid deletes that procedure job and records it as a clean stop. The command run mode is not changed; `restart` and `persistent` scheduling decide what runs next.
1111

1212
Coldstarter procedures are not stopped by this rule. For Minecraft restart-mode scrolls, put `keepAliveTraffic` on the real runtime procedure's `main` expected port, not on the coldstarter procedure.
1313

14-
The current Hubble integration tracks flow presence. Use a minimum such as `1b/60m` to mean "at least one observed flow in the last 60 minutes".
14+
Use values such as `10kb/5m` to mean "at least 10 KiB of pod RX traffic in the last 5 minutes". The metric is procedure-level: a single procedure pod can satisfy any of its configured keepalive expected ports.
1515

16-
Required daemon configuration:
16+
Required Kubernetes RBAC:
1717

1818
```
19-
DRUID_HUBBLE_RELAY_ADDR=hubble-relay.kube-system.svc.cluster.local:80
19+
apiGroups: [""]
20+
resources: ["nodes/proxy"]
21+
verbs: ["get"]
2022
```
2123

2224
Validation commands:
2325

2426
```
25-
kubectl -n kube-system get svc hubble-relay
26-
kubectl -n kube-system rollout status deployment/hubble-relay
27-
kubectl -n druid-system get deploy druid-cli -o jsonpath='{.spec.template.spec.containers[0].env[?(@.name=="DRUID_HUBBLE_RELAY_ADDR")].value}{"\n"}'
27+
kubectl auth can-i get nodes/proxy --as=system:serviceaccount:druid-system:druid-cli
28+
kubectl get --raw '/api/v1/nodes/<node>/proxy/stats/summary' | head
2829
```
2930

30-
If Hubble Relay is disabled or unavailable, druid does not stop any procedure for missing traffic and reports `hubble-relay-unavailable` in port status/logs.
31+
After a daemon restart, druid fails open (it does not stop any procedure for missing traffic) until enough pod-stat samples exist to cover the configured window. If pod stats are unavailable or the active pod cannot be resolved, druid does not stop the procedure for missing traffic and reports `kubernetes-pod-stats-unavailable` in port status/logs.

go.mod

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -41,19 +41,17 @@ require (
4141
github.com/valyala/bytebufferpool v1.0.0 // indirect
4242
github.com/valyala/fasthttp v1.65.0 // indirect
4343
go.uber.org/multierr v1.11.0 // indirect
44-
golang.org/x/net v0.47.0 // indirect
44+
golang.org/x/net v0.47.0
4545
golang.org/x/sync v0.18.0 // indirect
4646
golang.org/x/sys v0.38.0 // indirect
4747
golang.org/x/text v0.31.0 // indirect
48-
google.golang.org/protobuf v1.36.6
48+
google.golang.org/protobuf v1.36.6 // indirect
4949
)
5050

5151
require (
5252
github.com/Microsoft/go-winio v0.6.2 // indirect
5353
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
54-
github.com/aws/aws-sdk-go-v2 v1.41.7 // indirect
5554
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.10 // indirect
56-
github.com/aws/aws-sdk-go-v2/config v1.32.18 // indirect
5755
github.com/aws/aws-sdk-go-v2/credentials v1.19.17 // indirect
5856
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.23 // indirect
5957
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.23 // indirect
@@ -63,15 +61,13 @@ require (
6361
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.15 // indirect
6462
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.23 // indirect
6563
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.23 // indirect
66-
github.com/aws/aws-sdk-go-v2/service/s3 v1.101.0 // indirect
6764
github.com/aws/aws-sdk-go-v2/service/signin v1.0.11 // indirect
6865
github.com/aws/aws-sdk-go-v2/service/sso v1.30.17 // indirect
6966
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.36.0 // indirect
7067
github.com/aws/aws-sdk-go-v2/service/sts v1.42.1 // indirect
7168
github.com/aws/smithy-go v1.25.1 // indirect
72-
github.com/cenkalti/backoff/v4 v4.3.0 // indirect
73-
github.com/containerd/errdefs v0.3.0 // indirect
7469
github.com/containerd/errdefs/pkg v0.3.0 // indirect
70+
github.com/containerd/log v0.1.0 // indirect
7571
github.com/cpuguy83/go-md2man/v2 v2.0.6 // indirect
7672
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
7773
github.com/distribution/reference v0.6.0 // indirect
@@ -90,6 +86,8 @@ require (
9086
github.com/json-iterator/go v1.1.12 // indirect
9187
github.com/moby/docker-image-spec v1.3.1 // indirect
9288
github.com/moby/spdystream v0.5.0 // indirect
89+
github.com/moby/sys/atomicwriter v0.1.0 // indirect
90+
github.com/moby/term v0.5.2 // indirect
9391
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
9492
github.com/modern-go/reflect2 v1.0.2 // indirect
9593
github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 // indirect
@@ -99,11 +97,12 @@ require (
9997
github.com/ncruces/go-strftime v1.0.0 // indirect
10098
github.com/oasdiff/yaml v0.0.0-20250309154309-f31be36b4037 // indirect
10199
github.com/oasdiff/yaml3 v0.0.0-20250309153720-d2182401db90 // indirect
100+
github.com/onsi/gomega v1.36.1 // indirect
102101
github.com/pelletier/go-toml/v2 v2.2.3 // indirect
103102
github.com/perimeterx/marshmallow v1.1.5 // indirect
104103
github.com/pkg/errors v0.9.1 // indirect
104+
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
105105
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
106-
github.com/robfig/cron/v3 v3.0.1 // indirect
107106
github.com/russross/blackfriday/v2 v2.1.0 // indirect
108107
github.com/sagikazarmark/locafero v0.7.0 // indirect
109108
github.com/sourcegraph/conc v0.3.0 // indirect
@@ -112,16 +111,20 @@ require (
112111
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
113112
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 // indirect
114113
go.opentelemetry.io/otel v1.37.0 // indirect
115-
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.32.0 // indirect
114+
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.32.0 // indirect
116115
go.opentelemetry.io/otel/metric v1.37.0 // indirect
116+
go.opentelemetry.io/otel/sdk v1.36.0 // indirect
117117
go.opentelemetry.io/otel/trace v1.37.0 // indirect
118-
go.uber.org/atomic v1.11.0 // indirect
118+
go.opentelemetry.io/proto/otlp v1.7.0 // indirect
119119
go.yaml.in/yaml/v2 v2.4.2 // indirect
120+
go.yaml.in/yaml/v3 v3.0.4 // indirect
120121
golang.org/x/exp v0.0.0-20241210194714-1829a127f884 // indirect
121122
golang.org/x/oauth2 v0.30.0 // indirect
122123
golang.org/x/term v0.37.0 // indirect
123124
golang.org/x/time v0.12.0 // indirect
125+
google.golang.org/genproto/googleapis/api v0.0.0-20250603155806-513f23925822 // indirect
124126
google.golang.org/genproto/googleapis/rpc v0.0.0-20250721164621-a45f3dfb1074 // indirect
127+
google.golang.org/grpc v1.74.2 // indirect
125128
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
126129
gopkg.in/inf.v0 v0.9.1 // indirect
127130
gotest.tools/v3 v3.5.2 // indirect
@@ -139,7 +142,10 @@ require (
139142

140143
require (
141144
github.com/MicahParks/keyfunc v1.9.0
142-
github.com/cilium/cilium v1.18.6
145+
github.com/aws/aws-sdk-go-v2 v1.41.7
146+
github.com/aws/aws-sdk-go-v2/config v1.32.18
147+
github.com/aws/aws-sdk-go-v2/service/s3 v1.101.0
148+
github.com/containerd/errdefs v0.3.0
143149
github.com/docker/docker v28.3.3+incompatible
144150
github.com/docker/go-connections v0.5.0
145151
github.com/getkin/kin-openapi v0.133.0
@@ -149,7 +155,6 @@ require (
149155
github.com/otiai10/copy v1.14.0
150156
github.com/yuin/gopher-lua v1.1.1
151157
go.uber.org/mock v0.4.0
152-
google.golang.org/grpc v1.74.2
153158
gopkg.in/yaml.v2 v2.4.0
154159
k8s.io/api v0.33.4
155160
k8s.io/apimachinery v0.33.4

0 commit comments

Comments
 (0)