Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions deploy/operator/Chart.lock
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ dependencies:
- name: seaweedfs-operator
repository: https://seaweedfs.github.io/seaweedfs-operator/
version: 0.1.13
- name: prometheus-operator-crds
repository: https://prometheus-community.github.io/helm-charts
version: 29.0.0
- name: altinity-clickhouse-operator
repository: https://helm.altinity.com
version: 0.26.3
Expand All @@ -29,5 +32,5 @@ dependencies:
- name: telemetry
repository: file://../telemetry
version: 0.1.0
digest: sha256:51fb84f0e6f0c7ad968fb311d06104c261d684c098972a572d2eb07af2cee0cd
generated: "2026-05-14T14:31:46.62858-05:00"
digest: sha256:4b5de51ac92b6a1c4fe94c7ef9b6d044d8b634852807f2b52b44a42c5e851e0c
generated: "2026-05-29T14:07:08.772292-05:00"
4 changes: 4 additions & 0 deletions deploy/operator/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ dependencies:
version: 0.1.13
repository: https://seaweedfs.github.io/seaweedfs-operator/
condition: seaweedfs-operator.enabled
- name: prometheus-operator-crds
version: 29.0.0
repository: https://prometheus-community.github.io/helm-charts
condition: seaweedfs-operator.enabled
- name: altinity-clickhouse-operator
version: 0.26.3
repository: https://helm.altinity.com
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{{- /*
RBAC for the seaweedfs-operator to manage ServiceMonitor objects.

Renders a ClusterRole granting create/delete/get/list/patch/update/watch on
monitoring.coreos.com `servicemonitors`, and a ClusterRoleBinding that grants
it to the `<release>-seaweedfs-operator` ServiceAccount in the release
namespace.

NOTE(review): this gate reads "seaweedfs-operator".install, while the chart's
dependency conditions use seaweedfs-operator.enabled. Confirm both keys exist
in values.yaml and are intended to differ.
NOTE(review): presumably `<release>-seaweedfs-operator` is the ServiceAccount
name the subchart actually creates. Verify against the subchart.
*/ -}}
{{- if (index .Values "seaweedfs-operator" "install") }}
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ .Release.Name }}-seaweedfs-operator-servicemonitor
rules:
  - apiGroups:
      - monitoring.coreos.com
    resources:
      - servicemonitors
    verbs:
      - create
      - delete
      - get
      - list
      - patch
      - update
      - watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ .Release.Name }}-seaweedfs-operator-servicemonitor
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: {{ .Release.Name }}-seaweedfs-operator-servicemonitor
subjects:
  - kind: ServiceAccount
    name: {{ .Release.Name }}-seaweedfs-operator
    namespace: {{ .Release.Namespace }}
{{- end }}
26 changes: 26 additions & 0 deletions deploy/operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,32 @@ seaweedfs-operator:
webhook:
enabled: false

# Values for the prometheus-operator-crds subchart.
# Only the ServiceMonitor CRD is needed: the seaweedfs-operator creates
# ServiceMonitor objects during reconcile, and the VictoriaMetrics Operator
# auto-converts those objects to VMServiceScrapes. Every other CRD shipped by
# the subchart is switched off.
prometheus-operator-crds:
  crds:
    alertmanagerconfigs:
      enabled: false
    alertmanagers:
      enabled: false
    podmonitors:
      enabled: false
    probes:
      enabled: false
    prometheusagents:
      enabled: false
    prometheuses:
      enabled: false
    prometheusrules:
      enabled: false
    scrapeconfigs:
      enabled: false
    servicemonitors:
      enabled: true
    thanosrulers:
      enabled: false

altinity-clickhouse-operator:
crdHook:
image:
Expand Down
2 changes: 1 addition & 1 deletion deploy/telemetry/dashboards/wandb-field-investigation.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
"gridPos": {"h": 6, "w": 24, "x": 0, "y": 0},
"options": {
"mode": "markdown",
"content": "# W&B Field Investigation\n\nFirst-look diagnostic for the W&B application stack. Use this dashboard to answer **\"are users seeing problems, and where in the W&B app stack?\"**\n\nFor underlying infrastructure (MySQL, Redis, Kafka, MinIO, ClickHouse, container resources) see the [W&B Managed Install Performance](/d/wandb-managed-install-performance) dashboard.\n\n**Quick lookups from the shell:**\n\n- Install size & version: `kubectl get wandb -n <namespace> -o yaml`\n- Per-pod images: `kubectl get pods -n <namespace> -o jsonpath='{range .items[*]}{.metadata.name}{\"\\t\"}{.spec.containers[*].image}{\\\"\\n\\\"}{end}'`\n- Operator logs: `kubectl logs -n wandb-operators -l control-plane=controller-manager --tail=200`\n\nUse the **W&B Namespace** dropdown above to switch installs."
"content": "# W&B Field Investigation\n\nFirst-look diagnostic for the W&B application stack. Use this dashboard to answer **\"are users seeing problems, and where in the W&B app stack?\"**\n\nFor underlying infrastructure (MySQL, Redis, Kafka, SeaweedFS, ClickHouse, container resources) see the [W&B Managed Install Performance](/d/wandb-managed-install-performance) dashboard.\n\n**Quick lookups from the shell:**\n\n- Install size & version: `kubectl get wandb -n <namespace> -o yaml`\n- Per-pod images: `kubectl get pods -n <namespace> -o jsonpath='{range .items[*]}{.metadata.name}{\"\\t\"}{.spec.containers[*].image}{\\\"\\n\\\"}{end}'`\n- Operator logs: `kubectl logs -n wandb-operators -l control-plane=controller-manager --tail=200`\n\nUse the **W&B Namespace** dropdown above to switch installs."
}
},
{
Expand Down
20 changes: 10 additions & 10 deletions deploy/telemetry/dashboards/wandb-managed-install-performance.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
"gridPos": {"h": 6, "w": 24, "x": 0, "y": 0},
"options": {
"mode": "markdown",
"content": "# W&B Managed Install Performance\n\nInfrastructure health dashboard for a W&B install. Use this dashboard to answer **\"is the underlying infrastructure that runs W&B healthy?\"** — covers all five managed components (MySQL, Redis, Kafka, MinIO, ClickHouse) plus container resource utilization across the install.\n\nFor application-layer signals (W&B API latency, ingest health, slow operations) see the [W&B Field Investigation](/d/wandb-field-investigation) dashboard.\n\n**Quick lookups from the shell:**\n\n- Install size & version: `kubectl get wandb -n <namespace> -o yaml`\n- Pod restart reasons: `kubectl get pods -n <namespace> -o jsonpath='{range .items[*]}{.metadata.name}{\"\\t\"}{.status.containerStatuses[*].lastState.terminated.reason}{\\\"\\n\\\"}{end}'`\n- OOM-killed events: `kubectl get events -n <namespace> --sort-by=.lastTimestamp | grep -i oom`\n\nUse the **W&B Namespace** dropdown above to switch installs."
"content": "# W&B Managed Install Performance\n\nInfrastructure health dashboard for a W&B install. Use this dashboard to answer **\"is the underlying infrastructure that runs W&B healthy?\"** — covers all five managed components (MySQL, Redis, Kafka, SeaweedFS, ClickHouse) plus container resource utilization across the install.\n\nFor application-layer signals (W&B API latency, ingest health, slow operations) see the [W&B Field Investigation](/d/wandb-field-investigation) dashboard.\n\n**Quick lookups from the shell:**\n\n- Install size & version: `kubectl get wandb -n <namespace> -o yaml`\n- Pod restart reasons: `kubectl get pods -n <namespace> -o jsonpath='{range .items[*]}{.metadata.name}{\"\\t\"}{.status.containerStatuses[*].lastState.terminated.reason}{\\\"\\n\\\"}{end}'`\n- OOM-killed events: `kubectl get events -n <namespace> --sort-by=.lastTimestamp | grep -i oom`\n\nUse the **W&B Namespace** dropdown above to switch installs."
}
},
{
Expand Down Expand Up @@ -116,7 +116,7 @@
"gridPos": {"h": 10, "w": 8, "x": 16, "y": 17},
"options": {
"mode": "markdown",
"content": "### What is this?\n\nTop 15 containers in the selected namespace by CPU usage as a fraction of the configured CPU limit (1.0 = 100% of limit). Filters out system/agent containers. Includes both W&B services and the managed infrastructure components (MySQL, Redis, Kafka, MinIO, ClickHouse).\n\n### Why it matters\n\nA container sustained above 0.8 (80% of limit) will throttle frequently, slowing every request it handles. Above 1.0 means kernel-side throttling is constant.\n\n### What to do\n\nFor the configured limit, run `kubectl describe pod -n <namespace> <pod>` and look at the Limits section, or check `spec.size` in the WeightsAndBiases CR. For sustained saturation, increase the install size.\n\n*Series will be missing for containers without a configured CPU limit.*"
"content": "### What is this?\n\nTop 15 containers in the selected namespace by CPU usage as a fraction of the configured CPU limit (1.0 = 100% of limit). Filters out system/agent containers. Includes both W&B services and the managed infrastructure components (MySQL, Redis, Kafka, SeaweedFS, ClickHouse).\n\n### Why it matters\n\nA container sustained above 0.8 (80% of limit) will throttle frequently, slowing every request it handles. Above 1.0 means kernel-side throttling is constant.\n\n### What to do\n\nFor the configured limit, run `kubectl describe pod -n <namespace> <pod>` and look at the Limits section, or check `spec.size` in the WeightsAndBiases CR. For sustained saturation, increase the install size.\n\n*Series will be missing for containers without a configured CPU limit.*"
}
},
{
Expand Down Expand Up @@ -459,23 +459,23 @@
},
{
"type": "row",
"title": "MinIO",
"title": "Object Store",
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 111},
"collapsed": false
},
{
"type": "timeseries",
"title": "MinIO Capacity Used % & Request Rate",
"title": "Object Store Capacity Used % & Request Rate",
"datasource": {"type": "victoriametrics-metrics-datasource", "uid": "${DS_VICTORIAMETRICS}"},
"gridPos": {"h": 10, "w": 16, "x": 0, "y": 112},
"targets": [
{
"expr": "100 * (1 - (sum(minio_cluster_health_capacity_usable_free_bytes) / sum(minio_cluster_health_capacity_usable_total_bytes)))",
"expr": "100 * (1 - (sum(SeaweedFS_volumeServer_resource{type=\"free\"}) / sum(SeaweedFS_volumeServer_resource{type=\"all\"})))",
"legendFormat": "capacity used %",
"refId": "A"
},
{
"expr": "sum(rate(minio_api_requests_total[$__rate_interval]))",
"expr": "sum(rate(SeaweedFS_filer_request_total[$__rate_interval]))",
"legendFormat": "requests/sec",
"refId": "B"
}
Expand All @@ -502,11 +502,11 @@
},
{
"type": "text",
"title": "About: MinIO",
"title": "About: Object Store",
"gridPos": {"h": 10, "w": 8, "x": 16, "y": 112},
"options": {
"mode": "markdown",
"content": "### What is this?\n\n- **capacity used %** — percentage of usable cluster capacity in use.\n- **requests/sec** — total MinIO API request rate across all operations.\n\n### Why it matters\n\nMinIO is the object store for parquet files, artifact contents, and run media. \n\n- **Capacity above 90%** = urgent action required. New writes will fail soon.\n- **Capacity above 75%** = plan a storage increase.\n- **Sudden drop in request rate** = the API or executor services may be unable to reach MinIO; check service/DNS.\n\n### What to do\n\nFor capacity: increase storage in the WeightsAndBiases CR (`spec.objectStore.managedObjectStore.storageSize`) and re-apply, or migrate older artifacts off-cluster. For request failures: `kubectl logs -n <namespace> -l weightsandbiases.apps.wandb.com/component=minio --tail=200`."
"content": "### What is this?\n\n- **capacity used %** — percentage of usable volume-server capacity in use, derived from `SeaweedFS_volumeServer_resource{type=\"free\"|\"all\"}`.\n- **requests/sec** — filer HTTP request rate across all operations (`SeaweedFS_filer_request_total`).\n\n### Why it matters\n\nSeaweedFS is the object store for parquet files, artifact contents, and run media.\n\n- **Capacity above 90%** = urgent action required. New writes will fail soon.\n- **Capacity above 75%** = plan a storage increase.\n- **Sudden drop in request rate** = the API or executor services may be unable to reach the filer; check the seaweedfs Service and pod logs.\n\n### What to do\n\nFor capacity: increase storage in the WeightsAndBiases CR (`spec.objectStore.managedObjectStore.storageSize`) and re-apply, or migrate older artifacts off-cluster. For request failures: `kubectl logs -n <namespace> -l app.kubernetes.io/managed-by=seaweedfs-operator,app.kubernetes.io/component=filer --tail=200`."
}
},
{
Expand All @@ -522,7 +522,7 @@
"gridPos": {"h": 10, "w": 16, "x": 0, "y": 123},
"targets": [
{
"expr": "topk(10, histogram_quantile(0.95, sum by (service_name, span_name, le) (rate(traces_spanmetrics_duration_milliseconds_bucket{service_name=~\"gorilla.*\", span_name=~\"(?i)(parquet|filestream|filehandler|runstore|historystore|metadatastore|s3|minio|sql).*\"}[$__rate_interval]))))",
"expr": "topk(10, histogram_quantile(0.95, sum by (service_name, span_name, le) (rate(traces_spanmetrics_duration_milliseconds_bucket{service_name=~\"gorilla.*\", span_name=~\"(?i)(parquet|filestream|filehandler|runstore|historystore|metadatastore|s3|sql).*\"}[$__rate_interval]))))",
"legendFormat": "{{service_name}} / {{span_name}}",
"refId": "A"
}
Expand All @@ -542,7 +542,7 @@
"gridPos": {"h": 10, "w": 8, "x": 16, "y": 123},
"options": {
"mode": "markdown",
"content": "### What is this?\n\np95 latency per storage operation, derived from traces by the OTel `spanmetrics` connector. Covers parquet reads/writes, filestream chunks, MinIO/S3 calls, and SQL statements.\n\n### Why it matters\n\nSlow chart loads, slow file uploads, and stuck artifact downloads almost always trace back to one storage path being slow. This panel surfaces which one without opening individual traces.\n\n### What to do\n\n- Spikes on `ParquetHistoryStore.*` or `HistoryStore.*` → chart loads will feel slow. Check the parquet pod's CPU/memory in **Container Resource Usage** above, and ClickHouse load if installed.\n- Spikes on `FileStreamStore.*` / `FileHandler.*` → ingest stalls. Check filestream container restarts and MinIO capacity below.\n- Spikes on `sql.*` → MySQL is the bottleneck. See **MySQL Errors & Slow Queries** above.\n- For per-request detail, [open Traces in Explore](/explore?panes=%7B%22A%22:%7B%22datasource%22:%22${DS_VICTORIATRACES}%22,%22queries%22:[],%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D%7D&schemaVersion=1&orgId=1) and filter by the offending `service.name` + `span.name` with `duration > 1s`."
"content": "### What is this?\n\np95 latency per storage operation, derived from traces by the OTel `spanmetrics` connector. Covers parquet reads/writes, filestream chunks, S3 calls, and SQL statements.\n\n### Why it matters\n\nSlow chart loads, slow file uploads, and stuck artifact downloads almost always trace back to one storage path being slow. This panel surfaces which one without opening individual traces.\n\n### What to do\n\n- Spikes on `ParquetHistoryStore.*` or `HistoryStore.*` → chart loads will feel slow. Check the parquet pod's CPU/memory in **Container Resource Usage** above, and ClickHouse load if installed.\n- Spikes on `FileStreamStore.*` / `FileHandler.*` → ingest stalls. Check filestream container restarts and the **Object Store Capacity Used % & Request Rate** panel above.\n- Spikes on `sql.*` → MySQL is the bottleneck. See **MySQL Errors & Slow Queries** above.\n- For per-request detail, [open Traces in Explore](/explore?panes=%7B%22A%22:%7B%22datasource%22:%22${DS_VICTORIATRACES}%22,%22queries%22:[],%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D%7D&schemaVersion=1&orgId=1) and filter by the offending `service.name` + `span.name` with `duration > 1s`."
}
}
],
Expand Down
16 changes: 8 additions & 8 deletions deploy/telemetry/dashboards/wandb-telemetry-overview.json
Original file line number Diff line number Diff line change
Expand Up @@ -1946,7 +1946,7 @@
},
{
"type": "stat",
"title": "MinIO Requests / sec",
"title": "SeaweedFS Requests / sec",
"datasource": {
"type": "victoriametrics-metrics-datasource",
"uid": "${DS_VICTORIAMETRICS}"
Expand All @@ -1959,7 +1959,7 @@
},
"targets": [
{
"expr": "sum(rate(minio_api_requests_total[$__rate_interval]))",
"expr": "sum(rate(SeaweedFS_filer_request_total[$__rate_interval]))",
"refId": "A"
}
],
Expand All @@ -1980,7 +1980,7 @@
},
{
"type": "stat",
"title": "MinIO Free Capacity",
"title": "SeaweedFS Free Capacity",
"datasource": {
"type": "victoriametrics-metrics-datasource",
"uid": "${DS_VICTORIAMETRICS}"
Expand All @@ -1993,7 +1993,7 @@
},
"targets": [
{
"expr": "sum(minio_cluster_health_capacity_usable_free_bytes)",
"expr": "sum(SeaweedFS_volumeServer_resource{type=\"free\"})",
"refId": "A"
}
],
Expand Down Expand Up @@ -2067,7 +2067,7 @@
},
{
"type": "timeseries",
"title": "MinIO Capacity Used %",
"title": "SeaweedFS Capacity Used %",
"datasource": {
"type": "victoriametrics-metrics-datasource",
"uid": "${DS_VICTORIAMETRICS}"
Expand All @@ -2080,7 +2080,7 @@
},
"targets": [
{
"expr": "100 * (1 - (sum(minio_cluster_health_capacity_usable_free_bytes) / sum(minio_cluster_health_capacity_usable_total_bytes)))",
"expr": "100 * (1 - (sum(SeaweedFS_volumeServer_resource{type=\"free\"}) / sum(SeaweedFS_volumeServer_resource{type=\"all\"})))",
"refId": "A"
}
],
Expand Down Expand Up @@ -2120,7 +2120,7 @@
},
{
"type": "timeseries",
"title": "MinIO Request Rate",
"title": "SeaweedFS Request Rate",
"datasource": {
"type": "victoriametrics-metrics-datasource",
"uid": "${DS_VICTORIAMETRICS}"
Expand All @@ -2133,7 +2133,7 @@
},
"targets": [
{
"expr": "sum(rate(minio_api_requests_total[$__rate_interval]))",
"expr": "sum(rate(SeaweedFS_filer_request_total[$__rate_interval]))",
"legendFormat": "requests",
"refId": "A"
}
Expand Down
47 changes: 44 additions & 3 deletions deploy/telemetry/templates/telemetry-scrapes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -266,20 +266,61 @@ spec:
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMPodScrape
metadata:
name: seaweedfs
name: seaweedfs-master
namespace: {{ include "telemetry.namespace" . }}
labels:
app.kubernetes.io/component: telemetry
app.kubernetes.io/part-of: wandb
spec:
selector:
matchLabels:
weightsandbiases.apps.wandb.com/component: seaweedfs
app.kubernetes.io/managed-by: seaweedfs-operator
app.kubernetes.io/component: master
namespaceSelector:
matchNames:
- {{ include "telemetry.namespace" . }}
podMetricsEndpoints:
- port: swfs-s3
- port: master-metrics
path: /metrics
---
# Scrapes /metrics on the `volume-metrics` port of pods labelled as the
# seaweedfs-operator-managed `volume` component, restricted to the telemetry
# namespace.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMPodScrape
metadata:
  name: seaweedfs-volume
  namespace: {{ include "telemetry.namespace" . }}
  labels:
    app.kubernetes.io/component: telemetry
    app.kubernetes.io/part-of: wandb
spec:
  selector:
    matchLabels:
      app.kubernetes.io/managed-by: seaweedfs-operator
      app.kubernetes.io/component: volume
  namespaceSelector:
    matchNames:
      - {{ include "telemetry.namespace" . }}
  podMetricsEndpoints:
    - port: volume-metrics
      path: /metrics
---
# Scrapes /metrics on the `filer-metrics` port of pods labelled as the
# seaweedfs-operator-managed `filer` component, restricted to the telemetry
# namespace.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMPodScrape
metadata:
  name: seaweedfs-filer
  namespace: {{ include "telemetry.namespace" . }}
  labels:
    app.kubernetes.io/component: telemetry
    app.kubernetes.io/part-of: wandb
spec:
  selector:
    matchLabels:
      app.kubernetes.io/managed-by: seaweedfs-operator
      app.kubernetes.io/component: filer
  namespaceSelector:
    matchNames:
      - {{ include "telemetry.namespace" . }}
  podMetricsEndpoints:
    - port: filer-metrics
      path: /metrics
---
apiVersion: operator.victoriametrics.com/v1beta1
Expand Down
40 changes: 35 additions & 5 deletions hack/testing-manifests/telemetry/infra-metrics-dev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,19 +44,49 @@ spec:
- port: "tcp-prometheus"
path: /metrics
---
# SeaweedFS
# Scrapes SeaweedFS metrics endpoint
# SeaweedFS Master
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMPodScrape
metadata:
name: seaweedfs
name: seaweedfs-master
namespace: default
spec:
selector:
matchLabels:
weightsandbiases.apps.wandb.com/component: seaweedfs
app.kubernetes.io/managed-by: seaweedfs-operator
app.kubernetes.io/component: master
podMetricsEndpoints:
- port: swfs-s3
- port: master-metrics
path: /metrics
---
# SeaweedFS Volume
# Scrapes /metrics on the `volume-metrics` port of pods labelled as the
# seaweedfs-operator-managed `volume` component.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMPodScrape
metadata:
  name: seaweedfs-volume
  namespace: default
spec:
  selector:
    matchLabels:
      app.kubernetes.io/managed-by: seaweedfs-operator
      app.kubernetes.io/component: volume
  podMetricsEndpoints:
    - port: volume-metrics
      path: /metrics
---
# SeaweedFS Filer
# Scrapes /metrics on the `filer-metrics` port of pods labelled as the
# seaweedfs-operator-managed `filer` component.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMPodScrape
metadata:
  name: seaweedfs-filer
  namespace: default
spec:
  selector:
    matchLabels:
      app.kubernetes.io/managed-by: seaweedfs-operator
      app.kubernetes.io/component: filer
  podMetricsEndpoints:
    - port: filer-metrics
      path: /metrics
---
# Redis
Expand Down
Loading
Loading