EclipseFdn · skettkepalli · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026 · netomi
diff --git a/charts/openvsx/templates/scaledobject.yaml b/charts/openvsx/templates/scaledobject.yaml
@@ -27,11 +27,12 @@ spec:
               value: {{ .Values.keda.scaleDown.pods | default 1 }}
               periodSeconds: {{ .Values.keda.scaleDown.periodSeconds | default 180 }}
   triggers:
-  # Single signal: avg pod busy/max ratio, smoothed over 5m. Self-normalizing
-  # across jetty.threads.max changes — threshold stays valid at any cap.
-  # Filter on the environment label (set by management.metrics.tags) so this
-  # survives Alloy scrape-config changes that may or may not inject namespace.
+  # Avg pod busy/max ratio — self-normalizing across jetty.threads.max changes.
+  # metricType must be Value: the ratio query never exceeds 1.0, so AverageValue
+  # (desired = ceil(value/threshold)) would cap demand at ceil(1/threshold) pods at
+  # any load. Value scales proportionally: desired = ceil(current * value/threshold).
   - type: prometheus
+    metricType: Value
     metadata:
       serverAddress: {{ .Values.keda.prometheusAddress }}
       metricName: jetty_threads_busy_ratio
@@ -40,13 +41,21 @@ spec:
         avg(
           avg_over_time(
             (
-              jetty_threads_busy{environment="{{ .Values.environment }}",application="openvsx-server"}
+              jetty_threads_busy{environment="{{ .Values.environment }}",application="openvsx-server"{{- if .Values.keda.clusterName }},cluster="{{ .Values.keda.clusterName }}"{{- end }}}
               /
-              jetty_threads_config_max{environment="{{ .Values.environment }}",application="openvsx-server"}
-            )[5m:15s]
+              jetty_threads_config_max{environment="{{ .Values.environment }}",application="openvsx-server"{{- if .Values.keda.clusterName }},cluster="{{ .Values.keda.clusterName }}"{{- end }}}
+            )[{{ .Values.keda.smoothingWindow | default "5m" }}:30s]
           )
         )
       authModes: "basic"
     authenticationRef:
       name: {{ .Values.name }}-grafana-cloud-auth
+{{- if .Values.keda.cpuUtilization }}
+  # In-cluster safety net (metrics-server): scale-up still works if the Grafana
+  # Cloud path (app -> Alloy -> Grafana Cloud -> KEDA) is down. Needs CPU requests — TODO confirm.
+  - type: cpu
+    metricType: Utilization
+    metadata:
+      value: "{{ .Values.keda.cpuUtilization }}"
+{{- end }}
 {{- end }}
diff --git a/charts/openvsx/values-aws-staging.yaml b/charts/openvsx/values-aws-staging.yaml
@@ -10,6 +10,7 @@ replicaCount: 6
 esReplicaCount: 3
 keda:
   enabled: true
+  clusterName: eks-staging
   minReplicas: 12
   maxReplicas: 18
   prometheusAddress: "https://prometheus-prod-32-prod-ca-east-0.grafana.net/api/prom"

diff --git a/charts/openvsx/values-aws.yaml b/charts/openvsx/values-aws.yaml
@@ -119,21 +119,28 @@ es:
   storage_class: gp3-retain
   storage: 1Gi
 
-# ── KEDA — Jetty threads/connections via Prometheus (monitoring ns, eks/helm.tf) ──
-# Was Tomcat-based; the app runs Jetty (Spring Boot embedded), so the old queries
-# returned empty and autoscaling never fired. maxReplicas raised after the 2026-05-19
-# incident saturated 12 OKD replicas.
+# ── KEDA — Jetty busy-thread ratio (Grafana Cloud) + CPU safety net (metrics-server) ──
+# thresholdBusyRatio is derived from the observed production baseline: well above
+# steady-state noise, low enough to fire before thread saturation. clusterName pins
+# the query to this cluster's series so identically labeled series from other clusters
+# can't skew the average. The CPU trigger scales via metrics-server, independent of Grafana Cloud.
 keda:
   enabled: true
   prometheusAddress: https://prometheus-prod-32-prod-ca-east-0.grafana.net/api/prom
-  minReplicas: 6
-  maxReplicas: 20
+  clusterName: eks-production-openvsx
+  minReplicas: 8
+  # 15 replicas x 30 Hikari connections/pod (pinned open) = 450 of RDS
+  # max_connections=500. Raise only after shrinking the pool size.
+  maxReplicas: 15
   pollingInterval: 30
   cooldownPeriod: 600
+  thresholdBusyRatio: "0.15"
+  smoothingWindow: 3m
+  cpuUtilization: 70
   scaleUp:
-    stabilizationWindowSeconds: 180
-    pods: 2
-    periodSeconds: 120
+    stabilizationWindowSeconds: 60
+    pods: 4
+    periodSeconds: 60
   scaleDown:
     stabilizationWindowSeconds: 900
     pods: 1