Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ There is also websocket support for stdout. TTY is also supported.

Runtime selection is daemon-only: start the daemon with `druid daemon --runtime docker`, then use `druid` to create, run, and inspect scrolls without passing a runtime. Docker runtime state stays in SQLite under the runtime state directory. Scroll specs and runtime data live together in one runtime root.

Kubernetes runtime support is available with `druid daemon --runtime kubernetes` for in-cluster daemons or out-of-cluster daemons using kubeconfig. It stores daemon scroll state in ConfigMaps, materializes OCI artifacts through `druid worker pull` Jobs, and uses Cilium/Hubble Relay for port traffic presence. See `docs/kubernetes_runtime.md` for kubeconfig, RBAC, PVC, and Hubble setup.
Kubernetes runtime support is available with `druid daemon --runtime kubernetes` for in-cluster daemons or out-of-cluster daemons using kubeconfig. It stores daemon scroll state in ConfigMaps, materializes OCI artifacts through `druid worker pull` Jobs, and uses kubelet pod stats for procedure-level traffic checks. See `docs/kubernetes_runtime.md` for kubeconfig, RBAC, and PVC setup.

## Documentation

Expand Down
3 changes: 0 additions & 3 deletions apps/druid/adapters/cli/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ var k8sUIS3Region string
var k8sUIS3Endpoint string
var k8sUIS3Prefix string
var k8sUIS3Secret string
var hubbleRelayAddr string
var k8sKubeconfig string
var runtimeListen string
var runtimePublicListen string
Expand Down Expand Up @@ -98,7 +97,6 @@ func init() {
DaemonCommand.Flags().StringVar(&k8sUIS3Prefix, "k8s-ui-s3-prefix", "", "Optional S3 key prefix for UI packages (default: DRUID_K8S_UI_S3_PREFIX)")
DaemonCommand.Flags().StringVar(&k8sUIS3Secret, "k8s-ui-s3-credentials-secret", "", "Kubernetes secret with AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY (default: DRUID_K8S_UI_S3_CREDENTIALS_SECRET)")
DaemonCommand.Flags().StringVar(&k8sKubeconfig, "k8s-kubeconfig", "", "Kubernetes kubeconfig path for out-of-cluster runtime access (default: DRUID_K8S_KUBECONFIG, KUBECONFIG, or ~/.kube/config)")
DaemonCommand.Flags().StringVar(&hubbleRelayAddr, "hubble-relay-addr", "", "Hubble Relay gRPC address for Kubernetes port traffic (default: DRUID_HUBBLE_RELAY_ADDR or hubble-relay.kube-system.svc.cluster.local:80)")
}

func runRuntimeDaemon() error {
Expand All @@ -111,7 +109,6 @@ func runRuntimeDaemon() error {
StorageClass: k8sStorageClass,
PullImage: k8sPullImage,
RegistrySecret: k8sRegistrySecret,
HubbleRelayAddr: hubbleRelayAddr,
Kubeconfig: k8sKubeconfig,
UIS3Bucket: k8sUIS3Bucket,
UIS3PublicBaseURL: k8sUIS3PublicBaseURL,
Expand Down
8 changes: 5 additions & 3 deletions config/helm-charts/druid-cli/chart_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ func TestChartRendersDefaultAndCustomValues(t *testing.T) {
`resources: ["secrets"]`,
`resources: ["pods/attach"]`,
`verbs: ["create"]`,
"hubble-relay.kube-system.svc.cluster.local:80",
`resources: ["nodes/proxy"]`,
} {
if !strings.Contains(defaultManifest, want) {
t.Fatalf("default manifest does not contain %q", want)
Expand All @@ -51,7 +51,6 @@ func TestChartRendersDefaultAndCustomValues(t *testing.T) {
"--set", "runtime.pullImage=registry.local/druid-cli:e2e",
"--set", "runtime.helperImage=busybox:1.36",
"--set", "runtime.kubeconfigSecret.name=druid-kubeconfig",
"--set", "hubble.relayAddr=hubble.example:80",
"--set", "networkPolicy.enabled=true",
"--set", "ingress.enabled=true",
"--set", "ingress.hosts[0].host=runtime.example.test",
Expand All @@ -68,7 +67,6 @@ func TestChartRendersDefaultAndCustomValues(t *testing.T) {
"value: \"busybox:1.36\"",
"value: \"true\"",
"value: /etc/druid/kubeconfig",
"hubble.example:80",
"kind: NetworkPolicy",
"kind: Ingress",
"runtime.example.test",
Expand All @@ -80,6 +78,10 @@ func TestChartRendersDefaultAndCustomValues(t *testing.T) {
t.Fatalf("custom manifest does not contain %q", want)
}
}
removedTrafficEnv := "DRUID_" + "HU" + "BBLE_RELAY_ADDR"
if strings.Contains(defaultManifest, removedTrafficEnv) || strings.Contains(customManifest, removedTrafficEnv) {
t.Fatal("chart rendered removed traffic environment")
}
}

func helmTemplate(t *testing.T, args ...string) string {
Expand Down
2 changes: 0 additions & 2 deletions config/helm-charts/druid-cli/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,6 @@ spec:
value: {{ .Values.runtime.registrySecret | quote }}
- name: DRUID_REGISTRY_PLAIN_HTTP
value: {{ ternary "true" "false" .Values.runtime.registryPlainHTTP | quote }}
- name: DRUID_HUBBLE_RELAY_ADDR
value: {{ .Values.hubble.relayAddr | quote }}
{{- if .Values.runtime.kubeconfigSecret.name }}
- name: DRUID_K8S_KUBECONFIG
value: /etc/druid/kubeconfig
Expand Down
3 changes: 3 additions & 0 deletions config/helm-charts/druid-cli/templates/rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ rules:
- apiGroups: [""]
resources: ["pods/attach"]
verbs: ["create"]
- apiGroups: [""]
resources: ["nodes/proxy"]
verbs: ["get"]
- apiGroups: ["apps"]
resources: ["statefulsets"]
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
Expand Down
3 changes: 0 additions & 3 deletions config/helm-charts/druid-cli/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,6 @@ runtime:
name: ""
key: kubeconfig

hubble:
relayAddr: hubble-relay.kube-system.svc.cluster.local:80

auth:
enabled: false
jwksUrl: ""
Expand Down
19 changes: 10 additions & 9 deletions docs_md/kubernetes_keepalive.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,27 @@ sidebar_label: Kubernetes keepAliveTraffic

## Kubernetes keepAliveTraffic

Kubernetes runtimes use Hubble Relay to evaluate `keepAliveTraffic` on expected ports.
Kubernetes runtimes use kubelet pod network stats to evaluate `keepAliveTraffic` on running procedures.

When a running job procedure has an expected port with `keepAliveTraffic`, druid checks for matching Hubble flows over the configured window. If the full window has elapsed and no flow is observed, druid deletes that procedure job and records it as a clean stop. The command run mode is not changed; `restart` and `persistent` scheduling decide what runs next.
When a running job procedure has an expected port with `keepAliveTraffic`, druid samples that procedure pod's RX/TX bytes from `/api/v1/nodes/<node>/proxy/stats/summary`. If the full configured window has elapsed and the RX-byte delta is below every configured threshold, druid deletes that procedure job and records it as a clean stop. The command run mode is not changed; `restart` and `persistent` scheduling decide what runs next.

Coldstarter procedures are not stopped by this rule. For Minecraft restart-mode scrolls, put `keepAliveTraffic` on the real runtime procedure's `main` expected port, not on the coldstarter procedure.

The current Hubble integration tracks flow presence. Use a minimum such as `1b/60m` to mean "at least one observed flow in the last 60 minutes".
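Use values such as `10kb/5m` to mean "at least 10 KiB of pod RX traffic in the last 5 minutes". The metric is procedure-level, not per-port: traffic is measured for the whole procedure pod, so the procedure keeps running as long as the pod's RX bytes meet the threshold of at least one of its expected ports that configure `keepAliveTraffic`.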

Required daemon configuration:
Required Kubernetes RBAC:

```
DRUID_HUBBLE_RELAY_ADDR=hubble-relay.kube-system.svc.cluster.local:80
apiGroups: [""]
resources: ["nodes/proxy"]
verbs: ["get"]
```

Validation commands:

```
kubectl -n kube-system get svc hubble-relay
kubectl -n kube-system rollout status deployment/hubble-relay
kubectl -n druid-system get deploy druid-cli -o jsonpath='{.spec.template.spec.containers[0].env[?(@.name=="DRUID_HUBBLE_RELAY_ADDR")].value}{"\n"}'
kubectl auth can-i get nodes/proxy --as=system:serviceaccount:druid-system:druid-cli
kubectl get --raw '/api/v1/nodes/<node>/proxy/stats/summary' | head
```

If Hubble Relay is disabled or unavailable, druid does not stop any procedure for missing traffic and reports `hubble-relay-unavailable` in port status/logs.
After a daemon restart, druid fails open (it does not stop any procedure for missing traffic) until enough pod-stat samples exist to cover the configured window. If pod stats are unavailable or the active pod cannot be resolved, druid likewise does not stop the procedure for missing traffic and reports `kubernetes-pod-stats-unavailable` in port status/logs.
29 changes: 17 additions & 12 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -41,19 +41,17 @@ require (
github.com/valyala/bytebufferpool v1.0.0 // indirect
github.com/valyala/fasthttp v1.65.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/net v0.47.0 // indirect
golang.org/x/net v0.47.0
golang.org/x/sync v0.18.0 // indirect
golang.org/x/sys v0.38.0 // indirect
golang.org/x/text v0.31.0 // indirect
google.golang.org/protobuf v1.36.6
google.golang.org/protobuf v1.36.6 // indirect
)

require (
github.com/Microsoft/go-winio v0.6.2 // indirect
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
github.com/aws/aws-sdk-go-v2 v1.41.7 // indirect
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.10 // indirect
github.com/aws/aws-sdk-go-v2/config v1.32.18 // indirect
github.com/aws/aws-sdk-go-v2/credentials v1.19.17 // indirect
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.23 // indirect
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.23 // indirect
Expand All @@ -63,15 +61,13 @@ require (
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.15 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.23 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.23 // indirect
github.com/aws/aws-sdk-go-v2/service/s3 v1.101.0 // indirect
github.com/aws/aws-sdk-go-v2/service/signin v1.0.11 // indirect
github.com/aws/aws-sdk-go-v2/service/sso v1.30.17 // indirect
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.36.0 // indirect
github.com/aws/aws-sdk-go-v2/service/sts v1.42.1 // indirect
github.com/aws/smithy-go v1.25.1 // indirect
github.com/cenkalti/backoff/v4 v4.3.0 // indirect
github.com/containerd/errdefs v0.3.0 // indirect
github.com/containerd/errdefs/pkg v0.3.0 // indirect
github.com/containerd/log v0.1.0 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.6 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/distribution/reference v0.6.0 // indirect
Expand All @@ -90,6 +86,8 @@ require (
github.com/json-iterator/go v1.1.12 // indirect
github.com/moby/docker-image-spec v1.3.1 // indirect
github.com/moby/spdystream v0.5.0 // indirect
github.com/moby/sys/atomicwriter v0.1.0 // indirect
github.com/moby/term v0.5.2 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 // indirect
Expand All @@ -99,11 +97,12 @@ require (
github.com/ncruces/go-strftime v1.0.0 // indirect
github.com/oasdiff/yaml v0.0.0-20250309154309-f31be36b4037 // indirect
github.com/oasdiff/yaml3 v0.0.0-20250309153720-d2182401db90 // indirect
github.com/onsi/gomega v1.36.1 // indirect
github.com/pelletier/go-toml/v2 v2.2.3 // indirect
github.com/perimeterx/marshmallow v1.1.5 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
github.com/robfig/cron/v3 v3.0.1 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/sagikazarmark/locafero v0.7.0 // indirect
github.com/sourcegraph/conc v0.3.0 // indirect
Expand All @@ -112,16 +111,20 @@ require (
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 // indirect
go.opentelemetry.io/otel v1.37.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.32.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.32.0 // indirect
go.opentelemetry.io/otel/metric v1.37.0 // indirect
go.opentelemetry.io/otel/sdk v1.36.0 // indirect
go.opentelemetry.io/otel/trace v1.37.0 // indirect
go.uber.org/atomic v1.11.0 // indirect
go.opentelemetry.io/proto/otlp v1.7.0 // indirect
go.yaml.in/yaml/v2 v2.4.2 // indirect
go.yaml.in/yaml/v3 v3.0.4 // indirect
golang.org/x/exp v0.0.0-20241210194714-1829a127f884 // indirect
golang.org/x/oauth2 v0.30.0 // indirect
golang.org/x/term v0.37.0 // indirect
golang.org/x/time v0.12.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20250603155806-513f23925822 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20250721164621-a45f3dfb1074 // indirect
google.golang.org/grpc v1.74.2 // indirect
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gotest.tools/v3 v3.5.2 // indirect
Expand All @@ -139,7 +142,10 @@ require (

require (
github.com/MicahParks/keyfunc v1.9.0
github.com/cilium/cilium v1.18.6
github.com/aws/aws-sdk-go-v2 v1.41.7
github.com/aws/aws-sdk-go-v2/config v1.32.18
github.com/aws/aws-sdk-go-v2/service/s3 v1.101.0
github.com/containerd/errdefs v0.3.0
github.com/docker/docker v28.3.3+incompatible
github.com/docker/go-connections v0.5.0
github.com/getkin/kin-openapi v0.133.0
Expand All @@ -149,7 +155,6 @@ require (
github.com/otiai10/copy v1.14.0
github.com/yuin/gopher-lua v1.1.1
go.uber.org/mock v0.4.0
google.golang.org/grpc v1.74.2
gopkg.in/yaml.v2 v2.4.0
k8s.io/api v0.33.4
k8s.io/apimachinery v0.33.4
Expand Down
Loading
Loading