Skip to content

Commit 0189157

Browse files
authored
Merge pull request #37 from yieon-lyon/feat/webhook-category-severity-filter
feat: add webhook filtering by failure category and severity
2 parents f4b957f + 6bfdfbc commit 0189157

4 files changed

Lines changed: 118 additions & 8 deletions

File tree

containers/devops-agent-operator/README.md

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,38 @@ ContainerCreating, Unschedulable 같은 일시적 상태는 설정된 대기 시
108108
| `PROCESSED_TTL` | 중복 처리 방지 기간 | `1h` |
109109
| `FAILURE_GRACE_PERIOD` | 타임아웃 대기 기간 | `3m` |
110110
| `FAILURE_RECHECK_INTERVAL` | 타임아웃 재확인 간격 | `1m` |
111+
| `WEBHOOK_SKIP_CATEGORIES` | 웹훅/S3/CloudWatch 출력을 건너뛸 감지 레이어 (쉼표 구분) | - |
112+
| `WEBHOOK_MIN_SEVERITY` | 출력을 트리거할 최소 심각도 | - |
113+
114+
#### 출력 필터링
115+
116+
`WEBHOOK_SKIP_CATEGORIES`와 `WEBHOOK_MIN_SEVERITY`는 AND 조건으로 동작합니다. 두 조건을 모두 통과해야 CloudWatch Logs, S3, Webhook 출력이 실행됩니다. 미설정 시 모든 장애에 대해 출력이 실행됩니다.
117+
118+
**WEBHOOK_SKIP_CATEGORIES** — 특정 감지 레이어의 장애를 출력에서 제외합니다.
119+
120+
유효한 값: `ContainerWaiting`, `ContainerTerminated`, `PodPhase`, `PodStatus`, `PodCondition` ([감지 레이어 상세](docs/ARCHITECTURE.md#3단계-장애-감지---detectpodfailure))
121+
122+
```
123+
# Layer 4, 5 장애는 출력하지 않음
124+
WEBHOOK_SKIP_CATEGORIES=PodPhase,PodCondition
125+
```
126+
127+
**WEBHOOK_MIN_SEVERITY** — 설정한 심각도 이상의 장애만 출력합니다.
128+
129+
유효한 값 (심각도가 높은 순): `CRITICAL`, `HIGH`, `MEDIUM`, `LOW` ([심각도별 장애 유형](docs/ARCHITECTURE.md#6단계-심각도-결정---determineseverity))
130+
131+
```
132+
# HIGH 이상(CRITICAL, HIGH)만 출력
133+
WEBHOOK_MIN_SEVERITY=HIGH
134+
```
135+
136+
두 옵션을 조합하면 더 세밀하게 제어할 수 있습니다.
137+
138+
```
139+
# PodPhase, PodCondition은 무조건 제외 + 나머지는 HIGH 이상만 출력
140+
WEBHOOK_SKIP_CATEGORIES=PodPhase,PodCondition
141+
WEBHOOK_MIN_SEVERITY=HIGH
142+
```
111143

112144
## IAM 권한
113145

containers/devops-agent-operator/examples/04-configmap.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,13 @@ data:
1919
WEBHOOK_TIMEOUT: "30s"
2020
# Duration to prevent reprocessing the same failure
2121
PROCESSED_TTL: "1h"
22+
# Comma-separated list of failure categories that should NOT trigger any output (CloudWatch Logs, S3, webhook).
23+
# Valid values: ContainerWaiting, ContainerTerminated, PodPhase, PodStatus, PodCondition
24+
# Empty means all categories trigger webhooks (default behavior).
25+
# Example: "PodPhase,PodCondition"
26+
WEBHOOK_SKIP_CATEGORIES: ""
27+
# Minimum severity level required to trigger any output (CloudWatch Logs, S3, webhook).
28+
# Valid values: LOW, MEDIUM, HIGH, CRITICAL
29+
# Empty means all severities trigger webhooks (default behavior).
30+
# Example: "HIGH" → only HIGH and CRITICAL failures trigger webhooks
31+
WEBHOOK_MIN_SEVERITY: ""

containers/devops-agent-operator/internal/config/config.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,16 @@ type Config struct {
8080
// During the grace period, the operator requeues the pod at this interval to check
8181
// whether the transient state has resolved or the grace period has elapsed.
8282
FailureRecheckInterval time.Duration
83+
84+
// WebhookSkipCategories is a list of failure categories whose outputs (CloudWatch Logs, S3, webhook) are skipped.
85+
// Valid values: ContainerWaiting, ContainerTerminated, PodPhase, PodStatus, PodCondition
86+
// Empty means all categories trigger webhooks.
87+
WebhookSkipCategories []string
88+
89+
// WebhookMinSeverity is the minimum severity level required to trigger any output (CloudWatch Logs, S3, webhook).
90+
// Valid values: LOW, MEDIUM, HIGH, CRITICAL
91+
// Empty means all severities trigger webhooks.
92+
WebhookMinSeverity string
8393
}
8494

8595
// DefaultConfig returns a Config with default values
@@ -183,6 +193,14 @@ func LoadFromEnv() *Config {
183193
}
184194
}
185195

196+
if v := os.Getenv("WEBHOOK_SKIP_CATEGORIES"); v != "" {
197+
cfg.WebhookSkipCategories = splitAndTrim(v, ",")
198+
}
199+
200+
if v := os.Getenv("WEBHOOK_MIN_SEVERITY"); v != "" {
201+
cfg.WebhookMinSeverity = strings.ToUpper(strings.TrimSpace(v))
202+
}
203+
186204
return cfg
187205
}
188206

@@ -221,6 +239,42 @@ func (c *Config) IsNamespaceWatched(namespace string) bool {
221239
return false
222240
}
223241

242+
// severityLevel maps each recognized severity name to a numeric rank that
// ShouldSendWebhook uses for threshold comparison.
243+
// Lower number = higher severity, consistent with P-level conventions (P0/P1/...).
// Keys are upper-case: LoadFromEnv upper-cases WEBHOOK_MIN_SEVERITY before it is
// looked up here, while the failure's own severity string is looked up as-is.
244+
var severityLevel = map[string]int{
245+
"CRITICAL": 0,
246+
"HIGH": 1,
247+
"MEDIUM": 2,
248+
"LOW": 3,
249+
}
250+
251+
// ShouldSendWebhook reports whether a failure with the given category and severity
// passes the configured output filters. Despite the name, PodReconciler.Reconcile
// uses the result to gate the CloudWatch Logs upload, the S3 upload and the webhook call.
252+
// Both conditions must pass (AND logic):
253+
// - category must not exactly equal (case-sensitive) any entry in WebhookSkipCategories
254+
// - severity must be at least as severe as WebhookMinSeverity
255+
//
256+
// If neither filter is configured, always returns true (default behavior preserved).
// The severity filter also passes when WebhookMinSeverity or severity is not a key of
// severityLevel, so an unrecognized value never suppresses a failure.
257+
func (c *Config) ShouldSendWebhook(category, severity string) bool {
258+
// Category filter: reject on the first exact match in the skip list
259+
for _, cat := range c.WebhookSkipCategories {
260+
if cat == category {
261+
return false
262+
}
263+
}
264+
265+
// Severity filter: a larger level number means a less severe failure, so
// anything ranked numerically above the configured minimum is rejected
266+
// e.g. WebhookMinSeverity=HIGH → only CRITICAL and HIGH pass
267+
if c.WebhookMinSeverity != "" {
268+
minLevel, minKnown := severityLevel[c.WebhookMinSeverity]
269+
curLevel, curKnown := severityLevel[severity]
270+
// Only filter when both names are recognized; unknown values fail open
if minKnown && curKnown && curLevel > minLevel {
271+
return false
272+
}
273+
}
274+
275+
return true
276+
}
277+
224278
// splitAndTrim splits a string by separator and trims whitespace
225279
func splitAndTrim(s, sep string) []string {
226280
parts := strings.Split(s, sep)

containers/devops-agent-operator/internal/controller/pod_controller.go

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -144,8 +144,20 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
144144
// Build collected data
145145
data := r.buildCollectedData(ctx, &pod, failure)
146146

147+
// Evaluate filter once for all outputs
148+
severity := collector.DetermineSeverity(failure)
149+
shouldSend := r.Config.ShouldSendWebhook(failure.Category, severity)
150+
151+
if !shouldSend {
152+
logger.Info("Outputs skipped by filter",
153+
"pod", req.NamespacedName,
154+
"category", failure.Category,
155+
"severity", severity,
156+
)
157+
}
158+
147159
// Upload to CloudWatch Logs if configured (optional)
148-
if r.CloudWatchClient != nil {
160+
if r.CloudWatchClient != nil && shouldSend {
149161
cwResult, err := r.CloudWatchClient.Upload(ctx, data)
150162
if err != nil {
151163
logger.Error(err, "Failed to upload data to CloudWatch Logs",
@@ -162,7 +174,7 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
162174

163175
// Upload to S3 if configured (optional)
164176
var s3URL string
165-
if r.S3Client != nil {
177+
if r.S3Client != nil && shouldSend {
166178
uploadResult, err := r.S3Client.Upload(ctx, data)
167179
if err != nil {
168180
logger.Error(err, "Failed to upload data to S3",
@@ -178,12 +190,14 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
178190
)
179191
}
180192

181-
// Send to webhook (required)
182-
if err := r.Webhook.Send(ctx, data, s3URL); err != nil {
183-
logger.Error(err, "Failed to send webhook",
184-
"pod", req.NamespacedName,
185-
)
186-
// Continue to mark as processed even if webhook fails
193+
// Send to webhook if not filtered by category or severity
194+
if shouldSend {
195+
if err := r.Webhook.Send(ctx, data, s3URL); err != nil {
196+
logger.Error(err, "Failed to send webhook",
197+
"pod", req.NamespacedName,
198+
)
199+
// Continue to mark as processed even if webhook fails
200+
}
187201
}
188202

189203
// Mark as processed

0 commit comments

Comments
 (0)