Skip to content

Commit acaa90a

Browse files
committed
feat: TFO-Agent configure for K8S
1 parent 5c31b74 commit acaa90a

10 files changed

Lines changed: 83 additions & 12 deletions

File tree

configs/tfo-agent-one-for-all.yaml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,18 @@ collectors:
2626
pdb: true
2727
events: true
2828
metrics_api: true
29-
# KSM gaps (new)
29+
# KSM gaps
3030
resource_quotas: true
3131
limit_ranges: true
3232
pod_conditions: true
3333
node_taints: true
3434
workload_generations: true
35+
# Extended metrics — full observability without external tools
36+
apiserver_metrics: true
37+
coredns_metrics: true
38+
coredns_service: "kube-dns.kube-system.svc.cluster.local:9153"
39+
container_extended_metrics: true
40+
pv_io_stats: true
3541

3642
prometheus_scraper:
3743
enabled: true

configs/tfo-agent.default.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,12 @@ collectors:
246246
pod_logs: true # Collect recent log lines from each running container
247247
pod_logs_tail_lines: 100 # Log lines per container per collection cycle
248248
pod_logs_namespaces: [] # Restrict pod log collection to these namespaces (empty = same as namespace filter)
249+
# Extended metrics — TFO Agent collects these itself, replacing a separate Prometheus + kube-state-metrics + cAdvisor scrape pipeline
250+
apiserver_metrics: true # Scrape kube-apiserver /metrics endpoint
251+
coredns_metrics: true # Scrape CoreDNS /metrics endpoint
252+
coredns_service: "kube-dns.kube-system.svc.cluster.local:9153" # CoreDNS service address
253+
container_extended_metrics: true # cpu_throttled, memory_working_set, oom (via Kubelet /stats/summary + cAdvisor)
254+
pv_io_stats: true # PV usage + IOPS from Kubelet volume stats
249255
# Sync resource state to TFO backend (PostgreSQL entities)
250256
sync_to_backend: true
251257
sync_interval: 60s

configs/tfo-agent.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,12 @@ collectors:
247247
pod_logs: true # Collect recent log lines from each running container
248248
pod_logs_tail_lines: 100 # Log lines per container per collection cycle
249249
pod_logs_namespaces: [] # Restrict pod log collection to these namespaces (empty = same as namespace filter)
250+
# Extended metrics — TFO Agent collects these itself, replacing a separate Prometheus + kube-state-metrics + cAdvisor scrape pipeline
251+
apiserver_metrics: true # Scrape kube-apiserver /metrics endpoint
252+
coredns_metrics: true # Scrape CoreDNS /metrics endpoint
253+
coredns_service: "kube-dns.kube-system.svc.cluster.local:9153" # CoreDNS service address
254+
container_extended_metrics: true # cpu_throttled, memory_working_set, oom (via Kubelet /stats/summary + cAdvisor)
255+
pv_io_stats: true # PV usage + IOPS from Kubelet volume stats
250256
# Sync resource state to TFO backend (populates PostgreSQL K8s entities)
251257
sync_to_backend: true
252258
sync_interval: 60s

deploy/helm/telemetryflow-agent/templates/clusterrole.yaml

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,28 @@ rules:
6767
- pods
6868
verbs: ["get", "list"]
6969

70-
# Non-resource URLs (e.g. /metrics, /healthz)
70+
# Pod logs (for pod_logs collector)
71+
- apiGroups: [""]
72+
resources:
73+
- pods/log
74+
verbs: ["get", "list"]
75+
76+
# Policy resources (PDB)
77+
- apiGroups: ["policy"]
78+
resources:
79+
- poddisruptionbudgets
80+
verbs: ["get", "list", "watch"]
81+
82+
# Discovery (EndpointSlices)
83+
- apiGroups: ["discovery.k8s.io"]
84+
resources:
85+
- endpointslices
86+
verbs: ["get", "list", "watch"]
87+
88+
# Non-resource URLs (metrics, healthz, cAdvisor)
7189
- nonResourceURLs:
7290
- /metrics
91+
- /metrics/cadvisor
7392
- /healthz
7493
- /readyz
7594
verbs: ["get"]

deploy/helm/telemetryflow-agent/values-one-for-all.yaml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,11 @@ config:
2727
enabled: true
2828

2929
kubernetes:
30-
enabled: false # overridden to true in kubernetes.config section below
30+
enabled: false # overridden to true in kubernetes.config section below
3131

3232
prometheus_scraper:
3333
enabled: true
34-
scrape_jobs: [] # user fills in their targets
34+
scrape_jobs: [] # user fills in their targets
3535

3636
remote_write_receiver:
3737
enabled: true
@@ -48,3 +48,9 @@ kubernetes:
4848
node_taints: true
4949
workload_generations: true
5050
metrics_api: true
51+
# Extended metrics — full observability without external tools
52+
apiserver_metrics: true
53+
coredns_metrics: true
54+
coredns_service: "kube-dns.kube-system.svc.cluster.local:9153"
55+
container_extended_metrics: true
56+
pv_io_stats: true

deploy/helm/telemetryflow-agent/values.yaml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ environment: production
2525
image:
2626
repository: ghcr.io/telemetryflow/tfo-agent
2727
pullPolicy: IfNotPresent
28-
tag: "" # Defaults to Chart.appVersion
28+
tag: "" # Defaults to Chart.appVersion
2929

3030
imagePullSecrets: []
3131

@@ -298,6 +298,12 @@ kubernetes:
298298
pod_logs_namespaces: []
299299
exclude_namespaces:
300300
- kube-system
301+
# Extended metrics — TFO Agent collects directly (replaces Prometheus stack)
302+
apiserver_metrics: true # Scrape kube-apiserver /metrics
303+
coredns_metrics: true # Scrape CoreDNS /metrics
304+
coredns_service: "kube-dns.kube-system.svc.cluster.local:9153"
305+
container_extended_metrics: true # cpu_throttled, memory_working_set, oom (via Kubelet)
306+
pv_io_stats: true # PV usage + IOPS (via Kubelet volume stats)
301307
# KSM gaps (new — all default false, enabled by oneForAll.enabled)
302308
resource_quotas: false
303309
limit_ranges: false

deploy/kubernetes/configmap.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,16 @@ data:
121121
resource_counts: true
122122
network: true # Kubelet /stats/summary (requires nodes/proxy RBAC)
123123
metrics_api: true # CPU/Memory usage from metrics-server (set false if not installed)
124+
hpa: true
125+
pdb: true
126+
pod_logs: true
127+
pod_logs_tail_lines: 100
128+
# Extended metrics — collected by the agent itself, replacing a separate Prometheus + kube-state-metrics + cAdvisor scrape pipeline
129+
apiserver_metrics: true
130+
coredns_metrics: true
131+
coredns_service: "kube-dns.kube-system.svc.cluster.local:9153"
132+
container_extended_metrics: true # cpu_throttled, memory_working_set, oom
133+
pv_io_stats: true # PV usage + IOPS from Kubelet volume stats
124134
sync_to_backend: true
125135
sync_interval: 60s
126136
cluster_name: "" # auto-detected from CLUSTER_NAME env or hostname

deploy/kubernetes/daemonset.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ spec:
140140
- name: TELEMETRYFLOW_NODE_EXPORTER_ENABLED
141141
value: "true"
142142
- name: TELEMETRYFLOW_K8S_ENABLED
143-
value: "false" # K8s state handled by tfo-agent-k8s Deployment (deployment-k8s.yaml)
143+
value: "false" # K8s state handled by tfo-agent-k8s Deployment (deployment-k8s.yaml)
144144

145145
# Prometheus server for liveness/readiness probes
146146
- name: TELEMETRYFLOW_PROMETHEUS_ENABLED
@@ -156,7 +156,7 @@ spec:
156156

157157
# Cluster and environment tags for OTEL resource attributes
158158
- name: CLUSTER_NAME
159-
value: "" # override with your cluster name, or auto-detected from hostname
159+
value: "" # override with your cluster name, or auto-detected from hostname
160160
- name: ENVIRONMENT
161161
value: "production"
162162

@@ -205,15 +205,15 @@ spec:
205205
mountPropagation: HostToContainer
206206

207207
securityContext:
208-
runAsUser: 0 # required to read /proc and /sys for node metrics
208+
runAsUser: 0 # required to read /proc and /sys for node metrics
209209
runAsGroup: 0
210210
readOnlyRootFilesystem: true
211211
allowPrivilegeEscalation: false
212212
capabilities:
213213
drop:
214214
- ALL
215215
add:
216-
- SYS_PTRACE # process inspection for node metrics
216+
- SYS_PTRACE # process inspection for node metrics
217217

218218
volumes:
219219
- name: config

deploy/kubernetes/deployment-k8s.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ metadata:
2929
app.kubernetes.io/component: k8s-collector
3030
app.kubernetes.io/part-of: telemetryflow
3131
spec:
32-
replicas: 1 # exactly 1 — multiple replicas would duplicate cluster state syncs
32+
replicas: 1 # exactly 1 — multiple replicas would duplicate cluster state syncs
3333
selector:
3434
matchLabels:
3535
app.kubernetes.io/name: tfo-agent
@@ -134,7 +134,7 @@ spec:
134134
- name: TELEMETRYFLOW_K8S_ENABLED
135135
value: "true"
136136
- name: TELEMETRYFLOW_NODE_EXPORTER_ENABLED
137-
value: "false" # node metrics handled by tfo-agent DaemonSet
137+
value: "false" # node metrics handled by tfo-agent DaemonSet
138138

139139
# Prometheus server for liveness/readiness probes
140140
- name: TELEMETRYFLOW_PROMETHEUS_ENABLED
@@ -144,7 +144,7 @@ spec:
144144

145145
# Cluster identity — used in auto-registration name and OTEL resource attributes
146146
- name: CLUSTER_NAME
147-
value: "" # override with your cluster name; auto-detected from hostname if empty
147+
value: "" # override with your cluster name; auto-detected from hostname if empty
148148
- name: ENVIRONMENT
149149
value: "production"
150150

deploy/kubernetes/rbac.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,18 @@ rules:
7979
- volumeattachments
8080
verbs: ["get", "list", "watch"]
8181

82+
# Pod logs — for pod_logs collector
83+
- apiGroups: [""]
84+
resources:
85+
- pods/log
86+
verbs: ["get", "list"]
87+
88+
# Policy — PodDisruptionBudgets
89+
- apiGroups: ["policy"]
90+
resources:
91+
- poddisruptionbudgets
92+
verbs: ["get", "list", "watch"]
93+
8294
# Events API (events.k8s.io/v1 replaces core/v1 events in K8s 1.19+)
8395
- apiGroups: ["events.k8s.io"]
8496
resources:

0 commit comments

Comments
 (0)