Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions helm/versus-incident/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,13 @@ data:
{{- end }}
{{- end }}

# Queue listeners (SNS / SQS). The Go config reads these from the
# top-level `queue:` block (cfg.Queue.*), NOT from `alert:` — main.go
# only starts listeners when `queue.enable` is true.
# `default false` keeps both values explicit booleans: an unset Helm value
# would otherwise render as an empty scalar, which YAML parses as null.
queue:
enable: {{ or .Values.alert.sns.enable .Values.alert.sqs.enable | default false }}
debug_body: {{ .Values.alert.debugBody | default false }}

sns:
enable: {{ .Values.alert.sns.enable }}
{{- if .Values.alert.sns.httpsEndpointSubscriptionPath }}
Expand Down Expand Up @@ -228,6 +235,47 @@ data:
lookback: {{ .Values.agent.lookback | default "5m" }}
new_service_grace: {{ .Values.agent.newServiceGrace | default "30m" }}
sources_path: /app/config/agent_sources.yaml
{{- if .Values.agent.batchMax }}
batch_max: {{ .Values.agent.batchMax }}
{{- end }}
{{- if .Values.agent.signalMaxBytes }}
signal_max_bytes: {{ .Values.agent.signalMaxBytes }}
{{- end }}
{{- with .Values.agent.redaction }}
redaction:
enable: {{ .enable }}
redact_ips: {{ .redactIps }}
{{- with .extraPatterns }}
extra_patterns:
{{- toYaml . | nindent 10 }}
{{- end }}
{{- end }}
{{- with .Values.agent.miner }}
miner:
similarity_threshold: {{ .similarityThreshold }}
tree_depth: {{ .treeDepth }}
max_children: {{ .maxChildren }}
{{- end }}
{{- with .Values.agent.catalog }}
catalog:
persist_interval: {{ .persistInterval }}
auto_promote_after: {{ .autoPromoteAfter }}
spike_multiplier: {{ .spikeMultiplier }}
spike_min_frequency: {{ .spikeMinFrequency }}
spike_min_baseline_count: {{ .spikeMinBaselineCount }}
{{- end }}
{{- with .Values.agent.regex }}
regex:
default_pattern: {{ .defaultPattern | default "" | quote }}
{{- with .rules }}
rules:
{{- toYaml . | nindent 10 }}
{{- end }}
{{- end }}
{{- with .Values.agent.servicePatterns }}
service_patterns:
{{- toYaml . | nindent 8 }}
{{- end }}
ai:
enable: {{ .Values.agent.ai.enable }}
api_key: ${AGENT_AI_API_KEY}
Expand Down
35 changes: 33 additions & 2 deletions helm/versus-incident/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,39 @@ spec:
annotations:
checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }}
checksum/secret: {{ include (print $.Template.BasePath "/secret.yaml") . | sha256sum }}
{{- with .Values.podAnnotations }}
{{- toYaml . | nindent 8 }}
{{- end }}
spec:
serviceAccountName: {{ include "versus-incident.serviceAccountName" . }}
{{- with .Values.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.podSecurityContext }}
securityContext:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
containers:
- name: {{ .Chart.Name }}
image: "{{ .Values.image.repository }}:v{{ .Values.image.tag | default .Chart.AppVersion }}"
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default (printf "v%s" .Chart.AppVersion) }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
{{- with .Values.securityContext }}
securityContext:
{{- toYaml . | nindent 12 }}
{{- end }}
ports:
- name: http
containerPort: {{ .Values.config.port }}
Expand Down Expand Up @@ -436,7 +463,11 @@ spec:
value: "{{ .Values.aws.region }}"
{{- end }}
{{- end }}


{{- with .Values.extraEnv }}
{{- toYaml . | nindent 12 }}
{{- end }}

volumeMounts:
- name: config-volume
mountPath: /app/config/config.yaml
Expand Down
95 changes: 93 additions & 2 deletions helm/versus-incident/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,17 +47,58 @@ resources:
livenessProbe:
httpGet:
path: /healthz
port: 3000
port: http
initialDelaySeconds: 30
periodSeconds: 10

readinessProbe:
httpGet:
path: /healthz
port: 3000
port: http
initialDelaySeconds: 5
periodSeconds: 5

# Image pull secrets for private registries.
imagePullSecrets: []
# - name: my-registry-secret

# Extra environment variables injected verbatim into the container.
extraEnv: []
# - name: HTTP_PROXY
# value: http://proxy:3128

# Extra pod annotations (merged with the config/secret checksums that drive
# rolling restarts on config change).
podAnnotations: {}

# Pod- and container-level security contexts. Defaults run the static,
# non-root binary as uid/gid 65532 with all capabilities dropped; fsGroup
# makes the /app/data volume writable. Set either block to {} to opt out
# (e.g. an image that must run as root).
podSecurityContext:
fsGroup: 65532
securityContext:
runAsNonRoot: true
runAsUser: 65532
runAsGroup: 65532
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL

# Scheduling controls.
nodeSelector: {}
affinity: {}
tolerations: []

# AWS credentials + region. Only used when alert.sns or alert.sqs is enabled
# (rendered into the Secret and exposed as AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY / AWS_REGION). Prefer IRSA / instance roles in prod.
aws:
region: ""
accessKeyId: ""
secretAccessKey: ""

config:
name: versus-incident
host: 0.0.0.0
Expand Down Expand Up @@ -121,6 +162,56 @@ agent:
lookback: 5m
newServiceGrace: 30m

# --- Signal processing + detection tuning (rendered into config.yaml). ---
# If regex.defaultPattern is empty and no regex.rules are defined, the agent
# matches nothing and never builds a catalog. Defaults below mirror the
# in-repo config.yaml.
batchMax: 5000
signalMaxBytes: 65536

# PII/secret redaction applied before clustering.
redaction:
enable: true
redactIps: false
# Additional redaction patterns (regexes). Single-quoted so backslashes are
# literal and need no YAML escaping.
extraPatterns:
- '(?i)password=\S+'
- 'Authorization:\s*Bearer\s+\S+'

# Drain-style log miner (clustering).
miner:
similarityThreshold: 0.4
treeDepth: 4
maxChildren: 100

# Pattern catalog + spike detection.
catalog:
persistInterval: 30s
autoPromoteAfter: 50
spikeMultiplier: 5.0
spikeMinFrequency: 5
spikeMinBaselineCount: 20

# Regex pre-filter: only matching signals reach the miner.
# defaultPattern ".*" trains on every line; "" requires a named rule match.
# Patterns are single-quoted so backslashes stay literal (no YAML escaping).
regex:
defaultPattern: '(?i).*error.*'
rules:
- name: oom-killer
pattern: 'Out of memory: Killed process'
- name: panic
pattern: '(?i)panic:'
- name: 5xx-burst
pattern: 'HTTP/[0-9.]+\s+5\d\d'

# Service-name extraction regexes (first capture group = service name).
servicePatterns:
- '(?i)\bservice[._-]?name["\s:=]+"?([A-Za-z0-9._-]+)'
- '(?i)\b(?:service|svc|app|component)\s*=\s*"?([A-Za-z0-9._-]+)'
- '(?i)"(?:service|svc|app|component)"\s*:\s*"([A-Za-z0-9._-]+)"'
- '\[[^\]]+\]\s+\[([A-Za-z0-9._-]+)\]'
- '---\s+\[[^\]]*\]\s+\[([A-Za-z0-9._-]+)\]'
- '([A-Za-z0-9._-]+)\[\d+\]:'
- '\[([A-Za-z0-9._-]+)\]'

ai:
# AI analyzer toggle. Required for detect mode to actually call an LLM.
# When false, detect mode classifies patterns but never calls the model
Expand Down