From 4f6e3939e9b29af2a005c70fde4ead180ee49465 Mon Sep 17 00:00:00 2001 From: nghiadaulau Date: Sat, 20 Jun 2026 12:58:17 +0700 Subject: [PATCH] fix(helm): render queue block + agent detection config; fix image tag, add aws/securityContext MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Move sns/sqs out of alert: into a top-level queue: block — the Go config reads listeners from cfg.Queue.*, so they were silently ignored. - Render the agent regex/redaction/miner/catalog/service_patterns into config.yaml; without regex.default_pattern the agent matched nothing. - Stop forcing a v prefix on image.tag (broke custom tags like :vlocal); only the default falls back to v. - Add the missing aws: values block (nil-pointer when sns/sqs enabled). - Add pod/container securityContext (non-root, drop ALL caps) and standard knobs (imagePullSecrets/nodeSelector/affinity/tolerations/podAnnotations/ extraEnv); probes use the named http port. Co-Authored-By: Claude Opus 4.8 (1M context) --- helm/versus-incident/templates/configmap.yaml | 48 ++++++++++ .../versus-incident/templates/deployment.yaml | 35 ++++++- helm/versus-incident/values.yaml | 95 ++++++++++++++++++- 3 files changed, 174 insertions(+), 4 deletions(-) diff --git a/helm/versus-incident/templates/configmap.yaml b/helm/versus-incident/templates/configmap.yaml index f38ae2d..5639269 100644 --- a/helm/versus-incident/templates/configmap.yaml +++ b/helm/versus-incident/templates/configmap.yaml @@ -96,6 +96,13 @@ data: {{- end }} {{- end }} + # Queue listeners (SNS / SQS). The Go config reads these from the + # top-level `queue:` block (cfg.Queue.*), NOT from `alert:` — main.go + # only starts listeners when `queue.enable` is true. 
+ queue: + enable: {{ or .Values.alert.sns.enable .Values.alert.sqs.enable }} + debug_body: {{ .Values.alert.debugBody }} + sns: enable: {{ .Values.alert.sns.enable }} {{- if .Values.alert.sns.httpsEndpointSubscriptionPath }} @@ -228,6 +235,47 @@ data: lookback: {{ .Values.agent.lookback | default "5m" }} new_service_grace: {{ .Values.agent.newServiceGrace | default "30m" }} sources_path: /app/config/agent_sources.yaml + {{- if .Values.agent.batchMax }} + batch_max: {{ .Values.agent.batchMax }} + {{- end }} + {{- if .Values.agent.signalMaxBytes }} + signal_max_bytes: {{ .Values.agent.signalMaxBytes }} + {{- end }} + {{- with .Values.agent.redaction }} + redaction: + enable: {{ .enable }} + redact_ips: {{ .redactIps }} + {{- with .extraPatterns }} + extra_patterns: + {{- toYaml . | nindent 10 }} + {{- end }} + {{- end }} + {{- with .Values.agent.miner }} + miner: + similarity_threshold: {{ .similarityThreshold }} + tree_depth: {{ .treeDepth }} + max_children: {{ .maxChildren }} + {{- end }} + {{- with .Values.agent.catalog }} + catalog: + persist_interval: {{ .persistInterval }} + auto_promote_after: {{ .autoPromoteAfter }} + spike_multiplier: {{ .spikeMultiplier }} + spike_min_frequency: {{ .spikeMinFrequency }} + spike_min_baseline_count: {{ .spikeMinBaselineCount }} + {{- end }} + {{- with .Values.agent.regex }} + regex: + default_pattern: {{ .defaultPattern | default "" | quote }} + {{- with .rules }} + rules: + {{- toYaml . | nindent 10 }} + {{- end }} + {{- end }} + {{- with .Values.agent.servicePatterns }} + service_patterns: + {{- toYaml . 
| nindent 8 }} + {{- end }} ai: enable: {{ .Values.agent.ai.enable }} api_key: ${AGENT_AI_API_KEY} diff --git a/helm/versus-incident/templates/deployment.yaml b/helm/versus-incident/templates/deployment.yaml index 184dd92..966bf85 100644 --- a/helm/versus-incident/templates/deployment.yaml +++ b/helm/versus-incident/templates/deployment.yaml @@ -16,12 +16,39 @@ spec: annotations: checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }} checksum/secret: {{ include (print $.Template.BasePath "/secret.yaml") . | sha256sum }} + {{- with .Values.podAnnotations }} + {{- toYaml . | nindent 8 }} + {{- end }} spec: serviceAccountName: {{ include "versus-incident.serviceAccountName" . }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.podSecurityContext }} + securityContext: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} containers: - name: {{ .Chart.Name }} - image: "{{ .Values.image.repository }}:v{{ .Values.image.tag | default .Chart.AppVersion }}" + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default (printf "v%s" .Chart.AppVersion) }}" imagePullPolicy: {{ .Values.image.pullPolicy }} + {{- with .Values.securityContext }} + securityContext: + {{- toYaml . | nindent 12 }} + {{- end }} ports: - name: http containerPort: {{ .Values.config.port }} @@ -436,7 +463,11 @@ spec: value: "{{ .Values.aws.region }}" {{- end }} {{- end }} - + + {{- with .Values.extraEnv }} + {{- toYaml . 
| nindent 12 }} + {{- end }} + volumeMounts: - name: config-volume mountPath: /app/config/config.yaml diff --git a/helm/versus-incident/values.yaml b/helm/versus-incident/values.yaml index 330cc83..8f6191b 100644 --- a/helm/versus-incident/values.yaml +++ b/helm/versus-incident/values.yaml @@ -47,17 +47,58 @@ resources: livenessProbe: httpGet: path: /healthz - port: 3000 + port: http initialDelaySeconds: 30 periodSeconds: 10 readinessProbe: httpGet: path: /healthz - port: 3000 + port: http initialDelaySeconds: 5 periodSeconds: 5 +# Image pull secrets for private registries. +imagePullSecrets: [] + # - name: my-registry-secret + +# Extra environment variables injected verbatim into the container. +extraEnv: [] + # - name: HTTP_PROXY + # value: http://proxy:3128 + +# Extra pod annotations (merged with the config/secret checksums that drive +# rolling restarts on config change). +podAnnotations: {} + +# Pod- and container-level security contexts. Defaults run the static, +# non-root binary as uid/gid 65532 with all capabilities dropped; fsGroup +# makes the /app/data volume writable. To opt out (e.g. an image that must +# run as root), set the block to null: Helm merges {} with these defaults. +podSecurityContext: + fsGroup: 65532 +securityContext: + runAsNonRoot: true + runAsUser: 65532 + runAsGroup: 65532 + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + +# Scheduling controls. +nodeSelector: {} +affinity: {} +tolerations: [] + +# AWS credentials + region. Only used when alert.sns or alert.sqs is enabled +# (rendered into the Secret and exposed as AWS_ACCESS_KEY_ID / +# AWS_SECRET_ACCESS_KEY / AWS_REGION). Prefer IRSA / instance roles in prod. +aws: + region: "" + accessKeyId: "" + secretAccessKey: "" + config: name: versus-incident host: 0.0.0.0 @@ -121,6 +162,56 @@ agent: lookback: 5m newServiceGrace: 30m + # --- Signal processing + detection tuning (rendered into config.yaml).
--- + # Without at least regex.defaultPattern set, the agent matches nothing and + # never builds a catalog. Defaults below mirror the in-repo config.yaml. + batchMax: 5000 + signalMaxBytes: 65536 + + # PII/secret redaction applied before clustering. + redaction: + enable: true + redactIps: false + extraPatterns: + - "(?i)password=\\S+" + - "Authorization:\\s*Bearer\\s+\\S+" + + # Drain-style log miner (clustering). + miner: + similarityThreshold: 0.4 + treeDepth: 4 + maxChildren: 100 + + # Pattern catalog + spike detection. + catalog: + persistInterval: 30s + autoPromoteAfter: 50 + spikeMultiplier: 5.0 + spikeMinFrequency: 5 + spikeMinBaselineCount: 20 + + # Regex pre-filter: only matching signals reach the miner. + # defaultPattern ".*" trains on every line; "" requires a named rule match. + regex: + defaultPattern: "(?i).*error.*" + rules: + - name: oom-killer + pattern: "Out of memory: Killed process" + - name: panic + pattern: "(?i)panic:" + - name: 5xx-burst + pattern: "HTTP/[0-9.]+\\s+5\\d\\d" + + # Service-name extraction regexes (first capture group = service name). + servicePatterns: + - '(?i)\bservice[._-]?name["\s:=]+"?([A-Za-z0-9._-]+)' + - '(?i)\b(?:service|svc|app|component)\s*=\s*"?([A-Za-z0-9._-]+)' + - '(?i)"(?:service|svc|app|component)"\s*:\s*"([A-Za-z0-9._-]+)"' + - '\[[^\]]+\]\s+\[([A-Za-z0-9._-]+)\]' + - '---\s+\[[^\]]*\]\s+\[([A-Za-z0-9._-]+)\]' + - '([A-Za-z0-9._-]+)\[\d+\]:' + - '\[([A-Za-z0-9._-]+)\]' + ai: # AI analyzer toggle. Required for detect mode to actually call an LLM. # When false, detect mode classifies patterns but never calls the model