From c5597cdd6f5d5182276c33dac2ef3f9da9a682a9 Mon Sep 17 00:00:00 2001 From: Jagat Thakkar Date: Fri, 29 May 2026 14:24:58 -0500 Subject: [PATCH 1/2] feat(telemetry): Wire up SeaweedFS metrics Sets MetricsPort on master/volume/filer in the Seaweed CR and replaces the broken `seaweedfs` VMPodScrape with three new ones that actually scrape the per-component metrics ports. Then swaps the leftover MinIO queries in the install + overview dashboards for `SeaweedFS_volumeServer_resource` (capacity) and `SeaweedFS_filer_request_total` (rate). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../dashboards/wandb-field-investigation.json | 2 +- .../wandb-managed-install-performance.json | 20 ++++---- .../dashboards/wandb-telemetry-overview.json | 16 +++---- .../templates/telemetry-scrapes.yaml | 47 +++++++++++++++++-- .../telemetry/infra-metrics-dev.yaml | 40 ++++++++++++++-- .../managed/objectstore/seaweedfs/spec.go | 13 ++++- .../objectstore/seaweedfs/spec_test.go | 15 ++++++ 7 files changed, 124 insertions(+), 29 deletions(-) diff --git a/deploy/telemetry/dashboards/wandb-field-investigation.json b/deploy/telemetry/dashboards/wandb-field-investigation.json index 283d1e02..6e58cc0a 100644 --- a/deploy/telemetry/dashboards/wandb-field-investigation.json +++ b/deploy/telemetry/dashboards/wandb-field-investigation.json @@ -51,7 +51,7 @@ "gridPos": {"h": 6, "w": 24, "x": 0, "y": 0}, "options": { "mode": "markdown", - "content": "# W&B Field Investigation\n\nFirst-look diagnostic for the W&B application stack. 
Use this dashboard to answer **\"are users seeing problems, and where in the W&B app stack?\"**\n\nFor underlying infrastructure (MySQL, Redis, Kafka, MinIO, ClickHouse, container resources) see the [W&B Managed Install Performance](/d/wandb-managed-install-performance) dashboard.\n\n**Quick lookups from the shell:**\n\n- Install size & version: `kubectl get wandb -n -o yaml`\n- Per-pod images: `kubectl get pods -n -o jsonpath='{range .items[*]}{.metadata.name}{\"\\t\"}{.spec.containers[*].image}{\\\"\\n\\\"}{end}'`\n- Operator logs: `kubectl logs -n wandb-operators -l control-plane=controller-manager --tail=200`\n\nUse the **W&B Namespace** dropdown above to switch installs." + "content": "# W&B Field Investigation\n\nFirst-look diagnostic for the W&B application stack. Use this dashboard to answer **\"are users seeing problems, and where in the W&B app stack?\"**\n\nFor underlying infrastructure (MySQL, Redis, Kafka, SeaweedFS, ClickHouse, container resources) see the [W&B Managed Install Performance](/d/wandb-managed-install-performance) dashboard.\n\n**Quick lookups from the shell:**\n\n- Install size & version: `kubectl get wandb -n -o yaml`\n- Per-pod images: `kubectl get pods -n -o jsonpath='{range .items[*]}{.metadata.name}{\"\\t\"}{.spec.containers[*].image}{\\\"\\n\\\"}{end}'`\n- Operator logs: `kubectl logs -n wandb-operators -l control-plane=controller-manager --tail=200`\n\nUse the **W&B Namespace** dropdown above to switch installs." 
} }, { diff --git a/deploy/telemetry/dashboards/wandb-managed-install-performance.json b/deploy/telemetry/dashboards/wandb-managed-install-performance.json index acc2764d..95248bba 100644 --- a/deploy/telemetry/dashboards/wandb-managed-install-performance.json +++ b/deploy/telemetry/dashboards/wandb-managed-install-performance.json @@ -50,7 +50,7 @@ "gridPos": {"h": 6, "w": 24, "x": 0, "y": 0}, "options": { "mode": "markdown", - "content": "# W&B Managed Install Performance\n\nInfrastructure health dashboard for a W&B install. Use this dashboard to answer **\"is the underlying infrastructure that runs W&B healthy?\"** — covers all five managed components (MySQL, Redis, Kafka, MinIO, ClickHouse) plus container resource utilization across the install.\n\nFor application-layer signals (W&B API latency, ingest health, slow operations) see the [W&B Field Investigation](/d/wandb-field-investigation) dashboard.\n\n**Quick lookups from the shell:**\n\n- Install size & version: `kubectl get wandb -n -o yaml`\n- Pod restart reasons: `kubectl get pods -n -o jsonpath='{range .items[*]}{.metadata.name}{\"\\t\"}{.status.containerStatuses[*].lastState.terminated.reason}{\\\"\\n\\\"}{end}'`\n- OOM-killed events: `kubectl get events -n --sort-by=.lastTimestamp | grep -i oom`\n\nUse the **W&B Namespace** dropdown above to switch installs." + "content": "# W&B Managed Install Performance\n\nInfrastructure health dashboard for a W&B install. 
Use this dashboard to answer **\"is the underlying infrastructure that runs W&B healthy?\"** — covers all five managed components (MySQL, Redis, Kafka, SeaweedFS, ClickHouse) plus container resource utilization across the install.\n\nFor application-layer signals (W&B API latency, ingest health, slow operations) see the [W&B Field Investigation](/d/wandb-field-investigation) dashboard.\n\n**Quick lookups from the shell:**\n\n- Install size & version: `kubectl get wandb -n -o yaml`\n- Pod restart reasons: `kubectl get pods -n -o jsonpath='{range .items[*]}{.metadata.name}{\"\\t\"}{.status.containerStatuses[*].lastState.terminated.reason}{\\\"\\n\\\"}{end}'`\n- OOM-killed events: `kubectl get events -n --sort-by=.lastTimestamp | grep -i oom`\n\nUse the **W&B Namespace** dropdown above to switch installs." } }, { @@ -116,7 +116,7 @@ "gridPos": {"h": 10, "w": 8, "x": 16, "y": 17}, "options": { "mode": "markdown", - "content": "### What is this?\n\nTop 15 containers in the selected namespace by CPU usage as a fraction of the configured CPU limit (1.0 = 100% of limit). Filters out system/agent containers. Includes both W&B services and the managed infrastructure components (MySQL, Redis, Kafka, MinIO, ClickHouse).\n\n### Why it matters\n\nA container sustained above 0.8 (80% of limit) will throttle frequently, slowing every request it handles. Above 1.0 means kernel-side throttling is constant.\n\n### What to do\n\nFor the configured limit, run `kubectl describe pod -n ` and look at the Limits section, or check `spec.size` in the WeightsAndBiases CR. For sustained saturation, increase the install size.\n\n*Series will be missing for containers without a configured CPU limit.*" + "content": "### What is this?\n\nTop 15 containers in the selected namespace by CPU usage as a fraction of the configured CPU limit (1.0 = 100% of limit). Filters out system/agent containers. 
Includes both W&B services and the managed infrastructure components (MySQL, Redis, Kafka, SeaweedFS, ClickHouse).\n\n### Why it matters\n\nA container sustained above 0.8 (80% of limit) will throttle frequently, slowing every request it handles. Above 1.0 means kernel-side throttling is constant.\n\n### What to do\n\nFor the configured limit, run `kubectl describe pod -n ` and look at the Limits section, or check `spec.size` in the WeightsAndBiases CR. For sustained saturation, increase the install size.\n\n*Series will be missing for containers without a configured CPU limit.*" } }, { @@ -459,23 +459,23 @@ }, { "type": "row", - "title": "MinIO", + "title": "Object Store", "gridPos": {"h": 1, "w": 24, "x": 0, "y": 111}, "collapsed": false }, { "type": "timeseries", - "title": "MinIO Capacity Used % & Request Rate", + "title": "Object Store Capacity Used % & Request Rate", "datasource": {"type": "victoriametrics-metrics-datasource", "uid": "${DS_VICTORIAMETRICS}"}, "gridPos": {"h": 10, "w": 16, "x": 0, "y": 112}, "targets": [ { - "expr": "100 * (1 - (sum(minio_cluster_health_capacity_usable_free_bytes) / sum(minio_cluster_health_capacity_usable_total_bytes)))", + "expr": "100 * (1 - (sum(SeaweedFS_volumeServer_resource{type=\"free\"}) / sum(SeaweedFS_volumeServer_resource{type=\"all\"})))", "legendFormat": "capacity used %", "refId": "A" }, { - "expr": "sum(rate(minio_api_requests_total[$__rate_interval]))", + "expr": "sum(rate(SeaweedFS_filer_request_total[$__rate_interval]))", "legendFormat": "requests/sec", "refId": "B" } @@ -502,11 +502,11 @@ }, { "type": "text", - "title": "About: MinIO", + "title": "About: Object Store", "gridPos": {"h": 10, "w": 8, "x": 16, "y": 112}, "options": { "mode": "markdown", - "content": "### What is this?\n\n- **capacity used %** — percentage of usable cluster capacity in use.\n- **requests/sec** — total MinIO API request rate across all operations.\n\n### Why it matters\n\nMinIO is the object store for parquet files, artifact 
contents, and run media. \n\n- **Capacity above 90%** = urgent action required. New writes will fail soon.\n- **Capacity above 75%** = plan a storage increase.\n- **Sudden drop in request rate** = the API or executor services may be unable to reach MinIO; check service/DNS.\n\n### What to do\n\nFor capacity: increase storage in the WeightsAndBiases CR (`spec.objectStore.managedObjectStore.storageSize`) and re-apply, or migrate older artifacts off-cluster. For request failures: `kubectl logs -n -l weightsandbiases.apps.wandb.com/component=minio --tail=200`." + "content": "### What is this?\n\n- **capacity used %** — percentage of usable volume-server capacity in use, derived from `SeaweedFS_volumeServer_resource{type=\"free\"|\"all\"}`.\n- **requests/sec** — filer HTTP request rate across all operations (`SeaweedFS_filer_request_total`).\n\n### Why it matters\n\nSeaweedFS is the object store for parquet files, artifact contents, and run media.\n\n- **Capacity above 90%** = urgent action required. New writes will fail soon.\n- **Capacity above 75%** = plan a storage increase.\n- **Sudden drop in request rate** = the API or executor services may be unable to reach the filer; check the seaweedfs Service and pod logs.\n\n### What to do\n\nFor capacity: increase storage in the WeightsAndBiases CR (`spec.objectStore.managedObjectStore.storageSize`) and re-apply, or migrate older artifacts off-cluster. For request failures: `kubectl logs -n -l app.kubernetes.io/managed-by=seaweedfs-operator,app.kubernetes.io/component=filer --tail=200`." 
} }, { @@ -522,7 +522,7 @@ "gridPos": {"h": 10, "w": 16, "x": 0, "y": 123}, "targets": [ { - "expr": "topk(10, histogram_quantile(0.95, sum by (service_name, span_name, le) (rate(traces_spanmetrics_duration_milliseconds_bucket{service_name=~\"gorilla.*\", span_name=~\"(?i)(parquet|filestream|filehandler|runstore|historystore|metadatastore|s3|minio|sql).*\"}[$__rate_interval]))))", + "expr": "topk(10, histogram_quantile(0.95, sum by (service_name, span_name, le) (rate(traces_spanmetrics_duration_milliseconds_bucket{service_name=~\"gorilla.*\", span_name=~\"(?i)(parquet|filestream|filehandler|runstore|historystore|metadatastore|s3|sql).*\"}[$__rate_interval]))))", "legendFormat": "{{service_name}} / {{span_name}}", "refId": "A" } @@ -542,7 +542,7 @@ "gridPos": {"h": 10, "w": 8, "x": 16, "y": 123}, "options": { "mode": "markdown", - "content": "### What is this?\n\np95 latency per storage operation, derived from traces by the OTel `spanmetrics` connector. Covers parquet reads/writes, filestream chunks, MinIO/S3 calls, and SQL statements.\n\n### Why it matters\n\nSlow chart loads, slow file uploads, and stuck artifact downloads almost always trace back to one storage path being slow. This panel surfaces which one without opening individual traces.\n\n### What to do\n\n- Spikes on `ParquetHistoryStore.*` or `HistoryStore.*` → chart loads will feel slow. Check the parquet pod's CPU/memory in **Container Resource Usage** above, and ClickHouse load if installed.\n- Spikes on `FileStreamStore.*` / `FileHandler.*` → ingest stalls. Check filestream container restarts and MinIO capacity below.\n- Spikes on `sql.*` → MySQL is the bottleneck. 
See **MySQL Errors & Slow Queries** above.\n- For per-request detail, [open Traces in Explore](/explore?panes=%7B%22A%22:%7B%22datasource%22:%22${DS_VICTORIATRACES}%22,%22queries%22:[],%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D%7D&schemaVersion=1&orgId=1) and filter by the offending `service.name` + `span.name` with `duration > 1s`." + "content": "### What is this?\n\np95 latency per storage operation, derived from traces by the OTel `spanmetrics` connector. Covers parquet reads/writes, filestream chunks, S3 calls, and SQL statements.\n\n### Why it matters\n\nSlow chart loads, slow file uploads, and stuck artifact downloads almost always trace back to one storage path being slow. This panel surfaces which one without opening individual traces.\n\n### What to do\n\n- Spikes on `ParquetHistoryStore.*` or `HistoryStore.*` → chart loads will feel slow. Check the parquet pod's CPU/memory in **Container Resource Usage** above, and ClickHouse load if installed.\n- Spikes on `FileStreamStore.*` / `FileHandler.*` → ingest stalls. Check filestream container restarts and object store capacity below.\n- Spikes on `sql.*` → MySQL is the bottleneck. See **MySQL Errors & Slow Queries** above.\n- For per-request detail, [open Traces in Explore](/explore?panes=%7B%22A%22:%7B%22datasource%22:%22${DS_VICTORIATRACES}%22,%22queries%22:[],%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D%7D&schemaVersion=1&orgId=1) and filter by the offending `service.name` + `span.name` with `duration > 1s`." 
} } ], diff --git a/deploy/telemetry/dashboards/wandb-telemetry-overview.json b/deploy/telemetry/dashboards/wandb-telemetry-overview.json index 2a3f2eca..8ae54eee 100644 --- a/deploy/telemetry/dashboards/wandb-telemetry-overview.json +++ b/deploy/telemetry/dashboards/wandb-telemetry-overview.json @@ -1946,7 +1946,7 @@ }, { "type": "stat", - "title": "MinIO Requests / sec", + "title": "SeaweedFS Requests / sec", "datasource": { "type": "victoriametrics-metrics-datasource", "uid": "${DS_VICTORIAMETRICS}" @@ -1959,7 +1959,7 @@ }, "targets": [ { - "expr": "sum(rate(minio_api_requests_total[$__rate_interval]))", + "expr": "sum(rate(SeaweedFS_filer_request_total[$__rate_interval]))", "refId": "A" } ], @@ -1980,7 +1980,7 @@ }, { "type": "stat", - "title": "MinIO Free Capacity", + "title": "SeaweedFS Free Capacity", "datasource": { "type": "victoriametrics-metrics-datasource", "uid": "${DS_VICTORIAMETRICS}" @@ -1993,7 +1993,7 @@ }, "targets": [ { - "expr": "sum(minio_cluster_health_capacity_usable_free_bytes)", + "expr": "sum(SeaweedFS_volumeServer_resource{type=\"free\"})", "refId": "A" } ], @@ -2067,7 +2067,7 @@ }, { "type": "timeseries", - "title": "MinIO Capacity Used %", + "title": "SeaweedFS Capacity Used %", "datasource": { "type": "victoriametrics-metrics-datasource", "uid": "${DS_VICTORIAMETRICS}" @@ -2080,7 +2080,7 @@ }, "targets": [ { - "expr": "100 * (1 - (sum(minio_cluster_health_capacity_usable_free_bytes) / sum(minio_cluster_health_capacity_usable_total_bytes)))", + "expr": "100 * (1 - (sum(SeaweedFS_volumeServer_resource{type=\"free\"}) / sum(SeaweedFS_volumeServer_resource{type=\"all\"})))", "refId": "A" } ], @@ -2120,7 +2120,7 @@ }, { "type": "timeseries", - "title": "MinIO Request Rate", + "title": "SeaweedFS Request Rate", "datasource": { "type": "victoriametrics-metrics-datasource", "uid": "${DS_VICTORIAMETRICS}" @@ -2133,7 +2133,7 @@ }, "targets": [ { - "expr": "sum(rate(minio_api_requests_total[$__rate_interval]))", + "expr": 
"sum(rate(SeaweedFS_filer_request_total[$__rate_interval]))", "legendFormat": "requests", "refId": "A" } diff --git a/deploy/telemetry/templates/telemetry-scrapes.yaml b/deploy/telemetry/templates/telemetry-scrapes.yaml index 5a733fb7..81944fd3 100644 --- a/deploy/telemetry/templates/telemetry-scrapes.yaml +++ b/deploy/telemetry/templates/telemetry-scrapes.yaml @@ -266,7 +266,7 @@ spec: apiVersion: operator.victoriametrics.com/v1beta1 kind: VMPodScrape metadata: - name: seaweedfs + name: seaweedfs-master namespace: {{ include "telemetry.namespace" . }} labels: app.kubernetes.io/component: telemetry @@ -274,12 +274,53 @@ metadata: spec: selector: matchLabels: - weightsandbiases.apps.wandb.com/component: seaweedfs + app.kubernetes.io/managed-by: seaweedfs-operator + app.kubernetes.io/component: master namespaceSelector: matchNames: - {{ include "telemetry.namespace" . }} podMetricsEndpoints: - - port: swfs-s3 + - port: master-metrics + path: /metrics +--- +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMPodScrape +metadata: + name: seaweedfs-volume + namespace: {{ include "telemetry.namespace" . }} + labels: + app.kubernetes.io/component: telemetry + app.kubernetes.io/part-of: wandb +spec: + selector: + matchLabels: + app.kubernetes.io/managed-by: seaweedfs-operator + app.kubernetes.io/component: volume + namespaceSelector: + matchNames: + - {{ include "telemetry.namespace" . }} + podMetricsEndpoints: + - port: volume-metrics + path: /metrics +--- +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMPodScrape +metadata: + name: seaweedfs-filer + namespace: {{ include "telemetry.namespace" . }} + labels: + app.kubernetes.io/component: telemetry + app.kubernetes.io/part-of: wandb +spec: + selector: + matchLabels: + app.kubernetes.io/managed-by: seaweedfs-operator + app.kubernetes.io/component: filer + namespaceSelector: + matchNames: + - {{ include "telemetry.namespace" . 
}} + podMetricsEndpoints: + - port: filer-metrics path: /metrics --- apiVersion: operator.victoriametrics.com/v1beta1 diff --git a/hack/testing-manifests/telemetry/infra-metrics-dev.yaml b/hack/testing-manifests/telemetry/infra-metrics-dev.yaml index ddf8acb6..31e66bd4 100644 --- a/hack/testing-manifests/telemetry/infra-metrics-dev.yaml +++ b/hack/testing-manifests/telemetry/infra-metrics-dev.yaml @@ -44,19 +44,49 @@ spec: - port: "tcp-prometheus" path: /metrics --- -# SeaweedFS -# Scrapes SeaweedFS metrics endpoint +# SeaweedFS Master apiVersion: operator.victoriametrics.com/v1beta1 kind: VMPodScrape metadata: - name: seaweedfs + name: seaweedfs-master namespace: default spec: selector: matchLabels: - weightsandbiases.apps.wandb.com/component: seaweedfs + app.kubernetes.io/managed-by: seaweedfs-operator + app.kubernetes.io/component: master podMetricsEndpoints: - - port: swfs-s3 + - port: master-metrics + path: /metrics +--- +# SeaweedFS Volume +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMPodScrape +metadata: + name: seaweedfs-volume + namespace: default +spec: + selector: + matchLabels: + app.kubernetes.io/managed-by: seaweedfs-operator + app.kubernetes.io/component: volume + podMetricsEndpoints: + - port: volume-metrics + path: /metrics +--- +# SeaweedFS Filer +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMPodScrape +metadata: + name: seaweedfs-filer + namespace: default +spec: + selector: + matchLabels: + app.kubernetes.io/managed-by: seaweedfs-operator + app.kubernetes.io/component: filer + podMetricsEndpoints: + - port: filer-metrics path: /metrics --- # Redis diff --git a/internal/controller/infra/managed/objectstore/seaweedfs/spec.go b/internal/controller/infra/managed/objectstore/seaweedfs/spec.go index b6b2b505..d61c1360 100644 --- a/internal/controller/infra/managed/objectstore/seaweedfs/spec.go +++ b/internal/controller/infra/managed/objectstore/seaweedfs/spec.go @@ -27,6 +27,12 @@ const ( seaweedFilerDataMountPath = 
"/data/filerldb2" ) +const ( + seaweedMasterMetricsPort int32 = 9091 + seaweedVolumeMetricsPort int32 = 9092 + seaweedFilerMetricsPort int32 = 9093 +) + func seaweedWritableVolumes() []corev1.Volume { return []corev1.Volume{ { @@ -87,6 +93,7 @@ func ToObjectStoreVendorSpec( Replicas: 1, DefaultReplication: &replication, VolumeSizeLimitMB: &volumeSizeLimitMB, + MetricsPort: ptr.To(seaweedMasterMetricsPort), ComponentSpec: seaweedv1.ComponentSpec{ Volumes: seaweedWritableVolumes(), VolumeMounts: seaweedWritableVolumeMounts(), @@ -95,6 +102,7 @@ func ToObjectStoreVendorSpec( Volume: &seaweedv1.VolumeSpec{ Replicas: infraSpec.Replicas, VolumeServerConfig: seaweedv1.VolumeServerConfig{ + MetricsPort: ptr.To(seaweedVolumeMetricsPort), ComponentSpec: seaweedv1.ComponentSpec{ Volumes: seaweedWritableVolumes(), VolumeMounts: seaweedWritableVolumeMounts(), @@ -107,8 +115,9 @@ func ToObjectStoreVendorSpec( }, }, Filer: &seaweedv1.FilerSpec{ - Replicas: 1, - Config: ptr.To("[leveldb2]\nenabled = true\ndir = \"" + seaweedFilerDataMountPath + "\""), + Replicas: 1, + MetricsPort: ptr.To(seaweedFilerMetricsPort), + Config: ptr.To("[leveldb2]\nenabled = true\ndir = \"" + seaweedFilerDataMountPath + "\""), ComponentSpec: seaweedv1.ComponentSpec{ Volumes: seaweedWritableVolumes(), VolumeMounts: seaweedWritableVolumeMounts(), diff --git a/internal/controller/infra/managed/objectstore/seaweedfs/spec_test.go b/internal/controller/infra/managed/objectstore/seaweedfs/spec_test.go index 39878ef4..393366c7 100644 --- a/internal/controller/infra/managed/objectstore/seaweedfs/spec_test.go +++ b/internal/controller/infra/managed/objectstore/seaweedfs/spec_test.go @@ -51,6 +51,21 @@ var _ = Describe("SeaweedFS vendor specs", func() { Expect(seaweed).NotTo(BeNil()) Expect(seaweed.Spec.Volume.ResourceRequirements.Requests[corev1.ResourceCPU]).To(Equal(resource.MustParse("500m"))) }) + + It("sets metrics ports on master, volume, and filer", func() { + seaweed, err := 
ToObjectStoreVendorSpec(context.Background(), seaweedWandb(), seaweedScheme()) + Expect(err).NotTo(HaveOccurred()) + Expect(seaweed).NotTo(BeNil()) + + Expect(seaweed.Spec.Master.MetricsPort).NotTo(BeNil()) + Expect(*seaweed.Spec.Master.MetricsPort).To(Equal(seaweedMasterMetricsPort)) + + Expect(seaweed.Spec.Volume.MetricsPort).NotTo(BeNil()) + Expect(*seaweed.Spec.Volume.MetricsPort).To(Equal(seaweedVolumeMetricsPort)) + + Expect(seaweed.Spec.Filer.MetricsPort).NotTo(BeNil()) + Expect(*seaweed.Spec.Filer.MetricsPort).To(Equal(seaweedFilerMetricsPort)) + }) }) func seaweedScheme() *runtime.Scheme { From 66ac6f5b1cd4dfc592986a4e529046b7a2626ade Mon Sep 17 00:00:00 2001 From: Jagat Thakkar Date: Fri, 29 May 2026 14:25:18 -0500 Subject: [PATCH 2/2] fix(operator): Install prometheus ServiceMonitor CRD for seaweedfs-operator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The seaweedfs-operator tries to create `ServiceMonitor` resources during reconcile, but our cluster doesn't have that CRD installed (we use VictoriaMetrics, not prometheus-operator), so its reconciler errors out after the master, and the volume and filer never come up. Adds the prometheus-operator-crds chart dep with just the ServiceMonitor CRD enabled plus an RBAC binding for the operator's ServiceAccount — the VictoriaMetrics operator auto-converts those ServiceMonitors to VMServiceScrapes. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- deploy/operator/Chart.lock | 7 ++-- deploy/operator/Chart.yaml | 4 +++ ...eaweedfs-operator-servicemonitor-role.yaml | 32 +++++++++++++++++++ deploy/operator/values.yaml | 26 +++++++++++++++ 4 files changed, 67 insertions(+), 2 deletions(-) create mode 100644 deploy/operator/templates/seaweedfs-operator-servicemonitor-role.yaml diff --git a/deploy/operator/Chart.lock b/deploy/operator/Chart.lock index 3801f143..77c78196 100644 --- a/deploy/operator/Chart.lock +++ b/deploy/operator/Chart.lock @@ -17,6 +17,9 @@ dependencies: - name: seaweedfs-operator repository: https://seaweedfs.github.io/seaweedfs-operator/ version: 0.1.13 +- name: prometheus-operator-crds + repository: https://prometheus-community.github.io/helm-charts + version: 29.0.0 - name: altinity-clickhouse-operator repository: https://helm.altinity.com version: 0.26.3 @@ -29,5 +32,5 @@ dependencies: - name: telemetry repository: file://../telemetry version: 0.1.0 -digest: sha256:51fb84f0e6f0c7ad968fb311d06104c261d684c098972a572d2eb07af2cee0cd -generated: "2026-05-14T14:31:46.62858-05:00" +digest: sha256:4b5de51ac92b6a1c4fe94c7ef9b6d044d8b634852807f2b52b44a42c5e851e0c +generated: "2026-05-29T14:07:08.772292-05:00" diff --git a/deploy/operator/Chart.yaml b/deploy/operator/Chart.yaml index 17dc34d5..a78e9803 100644 --- a/deploy/operator/Chart.yaml +++ b/deploy/operator/Chart.yaml @@ -35,6 +35,10 @@ dependencies: version: 0.1.13 repository: https://seaweedfs.github.io/seaweedfs-operator/ condition: seaweedfs-operator.enabled + - name: prometheus-operator-crds + version: 29.0.0 + repository: https://prometheus-community.github.io/helm-charts + condition: seaweedfs-operator.enabled - name: altinity-clickhouse-operator version: 0.26.3 repository: https://helm.altinity.com diff --git a/deploy/operator/templates/seaweedfs-operator-servicemonitor-role.yaml b/deploy/operator/templates/seaweedfs-operator-servicemonitor-role.yaml new file mode 100644 index 
00000000..a6e1b788 --- /dev/null +++ b/deploy/operator/templates/seaweedfs-operator-servicemonitor-role.yaml @@ -0,0 +1,32 @@ +{{- if (index .Values "seaweedfs-operator" "enabled") }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ .Release.Name }}-seaweedfs-operator-servicemonitor +rules: + - apiGroups: + - monitoring.coreos.com + resources: + - servicemonitors + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ .Release.Name }}-seaweedfs-operator-servicemonitor +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ .Release.Name }}-seaweedfs-operator-servicemonitor +subjects: + - kind: ServiceAccount + name: {{ .Release.Name }}-seaweedfs-operator + namespace: {{ .Release.Namespace }} +{{- end }} diff --git a/deploy/operator/values.yaml b/deploy/operator/values.yaml index aca7ec8d..f7a88737 100644 --- a/deploy/operator/values.yaml +++ b/deploy/operator/values.yaml @@ -136,6 +136,32 @@ seaweedfs-operator: webhook: enabled: false +# Only the ServiceMonitor CRD is needed — the seaweedfs-operator creates +# ServiceMonitor resources during reconcile, and the VictoriaMetrics +# operator auto-converts them to VMServiceScrapes. +prometheus-operator-crds: + crds: + alertmanagerconfigs: + enabled: false + alertmanagers: + enabled: false + podmonitors: + enabled: false + probes: + enabled: false + prometheusagents: + enabled: false + prometheuses: + enabled: false + prometheusrules: + enabled: false + scrapeconfigs: + enabled: false + servicemonitors: + enabled: true + thanosrulers: + enabled: false + altinity-clickhouse-operator: crdHook: image: