Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 16 additions & 7 deletions charts/openvsx/templates/scaledobject.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,12 @@ spec:
value: {{ .Values.keda.scaleDown.pods | default 1 }}
periodSeconds: {{ .Values.keda.scaleDown.periodSeconds | default 180 }}
triggers:
# Single signal: avg pod busy/max ratio, smoothed over 5m. Self-normalizing
# across jetty.threads.max changes — threshold stays valid at any cap.
# Filter on the environment label (set by management.metrics.tags) so this
# survives Alloy scrape-config changes that may or may not inject namespace.
# Avg pod busy/max ratio — self-normalizing across jetty.threads.max changes.
# metricType Value: the ratio query never exceeds 1.0, so AverageValue semantics
# (desired = ceil(value/threshold)) cap demand at ceil(1/threshold) pods at any load.
# Value semantics scale proportionally: desired = current * value/threshold.
- type: prometheus
metricType: Value
metadata:
serverAddress: {{ .Values.keda.prometheusAddress }}
metricName: jetty_threads_busy_ratio
Expand All @@ -40,13 +41,21 @@ spec:
avg(
avg_over_time(
(
jetty_threads_busy{environment="{{ .Values.environment }}",application="openvsx-server"}
jetty_threads_busy{environment="{{ .Values.environment }}",application="openvsx-server"{{- if .Values.keda.clusterName }},cluster="{{ .Values.keda.clusterName }}"{{- end }}}
/
jetty_threads_config_max{environment="{{ .Values.environment }}",application="openvsx-server"}
)[5m:15s]
jetty_threads_config_max{environment="{{ .Values.environment }}",application="openvsx-server"{{- if .Values.keda.clusterName }},cluster="{{ .Values.keda.clusterName }}"{{- end }}}
)[{{ .Values.keda.smoothingWindow | default "5m" }}:30s]
)
)
authModes: "basic"
authenticationRef:
name: {{ .Values.name }}-grafana-cloud-auth
{{- if .Values.keda.cpuUtilization }}
# In-cluster safety net via metrics-server — keeps scale-up functional when the
# Grafana Cloud round-trip (app -> Alloy -> Grafana Cloud -> KEDA) is down.
- type: cpu
metricType: Utilization
metadata:
value: "{{ .Values.keda.cpuUtilization }}"
{{- end }}
{{- end }}
1 change: 1 addition & 0 deletions charts/openvsx/values-aws-staging.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ replicaCount: 6
esReplicaCount: 3
keda:
enabled: true
clusterName: eks-staging
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For staging the name is eks-staging, but for production it is eks-production-openvsx. Either use the suffix for both or for neither, I guess?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We will need to recreate the staging cluster using IaC, so we will align the cluster name there!

minReplicas: 12
maxReplicas: 18
prometheusAddress: "https://prometheus-prod-32-prod-ca-east-0.grafana.net/api/prom"
Expand Down
25 changes: 16 additions & 9 deletions charts/openvsx/values-aws.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -119,21 +119,28 @@ es:
storage_class: gp3-retain
storage: 1Gi

# ── KEDA — Jetty threads/connections via Prometheus (monitoring ns, eks/helm.tf) ──
# Was Tomcat-based; the app runs Jetty (Spring Boot embedded), so the old queries
# returned empty and autoscaling never fired. maxReplicas raised after the 2026-05-19
# incident saturated 12 OKD replicas.
# ── KEDA — Jetty busy-thread ratio (Grafana Cloud) + CPU safety net (metrics-server) ──
# Threshold is derived from the observed production baseline: well above steady-state
# noise, low enough to fire before thread saturation. clusterName pins the query to
# this cluster's series so co-labeled metrics from other clusters can't skew the
# average. The CPU trigger scales via metrics-server, independent of Grafana Cloud.
keda:
enabled: true
prometheusAddress: https://prometheus-prod-32-prod-ca-east-0.grafana.net/api/prom
minReplicas: 6
maxReplicas: 20
clusterName: eks-production-openvsx
minReplicas: 8
# 15 caps fleet Hikari pools at 450 of RDS max_connections=500 (30/pod, pinned
# open). Raise only after shrinking the pool size.
maxReplicas: 15
pollingInterval: 30
cooldownPeriod: 600
thresholdBusyRatio: "0.15"
smoothingWindow: 3m
cpuUtilization: 70
scaleUp:
stabilizationWindowSeconds: 180
pods: 2
periodSeconds: 120
stabilizationWindowSeconds: 60
pods: 4
periodSeconds: 60
scaleDown:
stabilizationWindowSeconds: 900
pods: 1
Expand Down
Loading