Skip to content

Commit b79d601

Browse files
acailic and claude authored
feat: alert lifecycle management, configurable policies, and query performance indexes (#135)
* fix: handle string/date type mismatch in daily cost breakdown

  SQLite func.date() returns strings, not date objects. Use strftime and str() consistently to avoid AttributeError on .isoformat().

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* feat: add alert lifecycle management, alert policies, and query performance

  - Alert lifecycle: full state machine (firing → acknowledged → resolved/suppressed) with history tracking, bulk operations, and SSE streaming via trace_routes
  - Alert policies: configurable per-agent thresholds for alert_type, severity, and threshold_value; full CRUD via /api/alert-policies with tenant scoping
  - Storage: AlertPolicyModel, AlertPolicyRepository, cache layer, 3 migrations (006 alert lifecycle columns, 007 alert policies table, 008 performance indexes)
  - Frontend: AlertPolicy/AlertLifecycle types, API client methods, useAlerts and useAlertSummary hooks
  - Tests: 365+ lines covering lifecycle state transitions, policy CRUD, and index/query performance assertions

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* feat: add AlertDashboardPanel component and alert UI styles

  Adds the alert dashboard panel with lifecycle controls and policy management UI, wired into AnalyticsTab.

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix: clean up unused imports in AlertDashboardPanel

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: address PR review feedback for Phase 4 intelligence features

  - Add prefix-based cache invalidation to QueryCache
  - Fix cache key to include limit parameter in trending queries
  - Expand cache invalidation to cover all aggregate cache families
  - Fix frontend to send 'note' instead of 'resolution_note' in status updates
  - Fix trending API response unwrapping (unwrap .trending from envelope)
  - Fix bulk status return type to match backend ({ updated, status })
  - Change ManagedAlert.severity from RiskLevel to number, add severityLabel helper
  - Use numeric severity values in filter dropdown (matching backend float param)
  - Add accessibility (role, tabIndex, onKeyDown) to alert rows
  - Make AlertDeriver.get_threshold async-safe with coroutine detection
  - Use UNSET sentinel in policy_repo.update_policy for nullable field support
  - Rename ix_sessions_created_at to ix_sessions_started_at in migration 008
  - Replace real sleeps with monkeypatched time in cache tests
  - Fix hard-coded path in capture_search.py (use Path(__file__).parent)

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: acailic <acailic@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent de8ab15 commit b79d601

28 files changed

Lines changed: 3757 additions & 9 deletions

api/main.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from api.entity_routes import router as entity_router
2525
from api.exceptions import AppError
2626
from api.middleware import ContentTypeValidationMiddleware, LoggingMiddleware, RequestIDMiddleware
27+
from api.policy_routes import router as policy_router
2728
from api.replay_routes import router as replay_router
2829
from api.search_routes import router as search_router
2930
from api.session_routes import router as session_router
@@ -133,6 +134,7 @@ async def global_exception_handler(request: Request, exc: Exception) -> JSONResp
133134
app.include_router(cost_router)
134135
app.include_router(search_router)
135136
app.include_router(entity_router)
137+
app.include_router(policy_router)
136138
app.include_router(system_router)
137139
app.include_router(ui_router)
138140

api/policy_routes.py

Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
"""Alert policy API routes for configurable alert thresholds."""
2+
3+
from __future__ import annotations
4+
5+
from typing import Any
6+
7+
from fastapi import APIRouter, Depends, Query
8+
from sqlalchemy.ext.asyncio import AsyncSession
9+
10+
from api.dependencies import get_db_session, get_tenant_id
11+
from api.exceptions import NotFoundError
12+
from api.schemas import AlertPolicyCreate, AlertPolicyListResponse, AlertPolicySchema, AlertPolicyUpdate
13+
from storage import AlertPolicyRepository
14+
15+
# Router that owns every /api/alert-policies endpoint declared in this module;
# the "alert-policies" tag is attached to each of its routes.
router = APIRouter(tags=["alert-policies"])
16+
17+
18+
async def get_policy_repository(
    session: AsyncSession = Depends(get_db_session),
    tenant_id: str = Depends(get_tenant_id),
) -> AlertPolicyRepository:
    """Build the repository that the alert-policy routes depend on.

    Args:
        session: Database session resolved through ``get_db_session``.
        tenant_id: Tenant identifier resolved through ``get_tenant_id``.

    Returns:
        An ``AlertPolicyRepository`` constructed with the session and
        ``tenant_id``.
    """
    repository = AlertPolicyRepository(session, tenant_id=tenant_id)
    return repository
24+
25+
26+
@router.get("/api/alert-policies", response_model=AlertPolicyListResponse)
async def list_policies(
    agent_name: str | None = Query(default=None),
    limit: int = Query(default=100, ge=1, le=1000),
    repo: AlertPolicyRepository = Depends(get_policy_repository),
) -> AlertPolicyListResponse:
    """Return alert policies, optionally narrowed to one agent.

    Args:
        agent_name: Optional filter forwarded unchanged to the repository.
        limit: Upper bound on the number of policies fetched (1-1000).
        repo: Tenant-scoped ``AlertPolicyRepository``.

    Returns:
        The fetched policies plus ``total``, which is the number of policies
        in this response (not a count of all stored policies).
    """
    records = await repo.list_policies(agent_name=agent_name, limit=limit)

    # Convert each stored record into its response schema, field by field.
    serialized = [
        AlertPolicySchema(
            id=record.id,
            agent_name=record.agent_name,
            alert_type=record.alert_type,
            threshold_value=record.threshold_value,
            severity_threshold=record.severity_threshold,
            enabled=record.enabled,
            created_at=record.created_at,
            updated_at=record.updated_at,
        )
        for record in records
    ]
    return AlertPolicyListResponse(policies=serialized, total=len(records))
61+
62+
63+
@router.post("/api/alert-policies", response_model=AlertPolicySchema)
async def create_policy(
    data: AlertPolicyCreate,
    repo: AlertPolicyRepository = Depends(get_policy_repository),
) -> AlertPolicySchema:
    """Create an alert policy from the request body and persist it.

    Args:
        data: Validated creation payload.
        repo: Tenant-scoped ``AlertPolicyRepository``.

    Returns:
        The newly created policy as read back after the commit.
    """
    created = await repo.create_policy(
        agent_name=data.agent_name,
        alert_type=data.alert_type,
        threshold_value=data.threshold_value,
        severity_threshold=data.severity_threshold,
        enabled=data.enabled,
    )

    # Persist the new row, then reload it before building the response.
    db = repo.session
    await db.commit()
    await db.refresh(created)

    response = AlertPolicySchema(
        id=created.id,
        agent_name=created.agent_name,
        alert_type=created.alert_type,
        threshold_value=created.threshold_value,
        severity_threshold=created.severity_threshold,
        enabled=created.enabled,
        created_at=created.created_at,
        updated_at=created.updated_at,
    )
    return response
98+
99+
100+
@router.get("/api/alert-policies/{policy_id}", response_model=AlertPolicySchema)
async def get_policy(
    policy_id: str,
    repo: AlertPolicyRepository = Depends(get_policy_repository),
) -> AlertPolicySchema:
    """Fetch one alert policy by its identifier.

    Args:
        policy_id: Identifier of the policy to look up.
        repo: Tenant-scoped ``AlertPolicyRepository``.

    Returns:
        The matching alert policy.

    Raises:
        NotFoundError: If the repository returns nothing for ``policy_id``.
    """
    found = await repo.get_policy(policy_id)

    if found:
        return AlertPolicySchema(
            id=found.id,
            agent_name=found.agent_name,
            alert_type=found.alert_type,
            threshold_value=found.threshold_value,
            severity_threshold=found.severity_threshold,
            enabled=found.enabled,
            created_at=found.created_at,
            updated_at=found.updated_at,
        )

    raise NotFoundError(f"Policy {policy_id} not found")
131+
132+
133+
@router.put("/api/alert-policies/{policy_id}", response_model=AlertPolicySchema)
async def update_policy(
    policy_id: str,
    data: AlertPolicyUpdate,
    repo: AlertPolicyRepository = Depends(get_policy_repository),
) -> AlertPolicySchema:
    """Apply the request body to an existing alert policy and persist it.

    Args:
        policy_id: Identifier of the policy to modify.
        data: Validated update payload; every field is optional.
        repo: Tenant-scoped ``AlertPolicyRepository``.

    Returns:
        The policy as read back after the commit.

    Raises:
        NotFoundError: If the repository reports no policy for ``policy_id``.
    """
    # NOTE(review): fields the client omitted arrive here as None. Whether the
    # repository reads None as "leave unchanged" or "clear the column" is not
    # visible from this module -- confirm, especially for the nullable
    # agent_name and severity_threshold.
    updated = await repo.update_policy(
        policy_id=policy_id,
        agent_name=data.agent_name,
        alert_type=data.alert_type,
        threshold_value=data.threshold_value,
        severity_threshold=data.severity_threshold,
        enabled=data.enabled,
    )

    # Nothing was updated, so there is nothing to commit.
    if not updated:
        raise NotFoundError(f"Policy {policy_id} not found")

    # Persist the change, then reload the row before building the response.
    db = repo.session
    await db.commit()
    await db.refresh(updated)

    response = AlertPolicySchema(
        id=updated.id,
        agent_name=updated.agent_name,
        alert_type=updated.alert_type,
        threshold_value=updated.threshold_value,
        severity_threshold=updated.severity_threshold,
        enabled=updated.enabled,
        created_at=updated.created_at,
        updated_at=updated.updated_at,
    )
    return response
178+
179+
180+
@router.delete("/api/alert-policies/{policy_id}")
async def delete_policy(
    policy_id: str,
    repo: AlertPolicyRepository = Depends(get_policy_repository),
) -> dict[str, Any]:
    """Remove an alert policy and persist the deletion.

    Args:
        policy_id: Identifier of the policy to delete.
        repo: Tenant-scoped ``AlertPolicyRepository``.

    Returns:
        ``{"deleted": True, "policy_id": <policy_id>}`` on success.

    Raises:
        NotFoundError: If the repository reports that nothing was deleted.
    """
    was_deleted = await repo.delete_policy(policy_id)

    if was_deleted:
        # Only commit once the repository confirms the delete.
        await repo.session.commit()
        return {"deleted": True, "policy_id": policy_id}

    raise NotFoundError(f"Policy {policy_id} not found")

api/schemas.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,11 @@ class AnomalyAlertSchema(BaseModel):
300300
detection_source: str
301301
detection_config: dict[str, Any]
302302
created_at: datetime
303+
status: str | None = None
304+
acknowledged_at: datetime | None = None
305+
resolved_at: datetime | None = None
306+
dismissed_at: datetime | None = None
307+
resolution_note: str | None = None
303308

304309

305310
class AnomalyAlertListResponse(BaseModel):
@@ -310,6 +315,77 @@ class AnomalyAlertListResponse(BaseModel):
310315
total: int
311316

312317

318+
# ------------------------------------------------------------------
319+
# Alert Lifecycle Schemas
320+
# ------------------------------------------------------------------
321+
322+
323+
class AlertStatusUpdate(BaseModel):
    """Request body for changing the status of one alert."""

    # Required target status, 1-32 characters.
    status: str = Field(..., min_length=1, max_length=32)
    # Optional free-text note, at most 2000 characters.
    note: str | None = Field(None, max_length=2000)
328+
329+
330+
class AlertBulkUpdate(BaseModel):
    """Request body for changing the status of several alerts at once."""

    # Identifiers of the alerts to update; at least one is required.
    alert_ids: list[str] = Field(..., min_length=1)
    # Required target status, 1-32 characters.
    status: str = Field(..., min_length=1, max_length=32)
335+
336+
337+
class AlertFilters(BaseModel):
    """Filter criteria accepted when querying alerts.

    Every criterion is optional; ``limit`` defaults to 50.
    """

    agent_name: str | None = None
    # Severity value, constrained to the inclusive range 0.0-1.0.
    severity: float | None = Field(None, ge=0.0, le=1.0)
    alert_type: str | None = None
    status: str | None = None
    # Bounds of the date window (how the bounds are applied is decided by the
    # consumer of this schema).
    from_date: datetime | None = None
    to_date: datetime | None = None
    # Maximum number of alerts to return, 1-500.
    limit: int = Field(50, ge=1, le=500)
347+
348+
349+
class AlertSeverityCount(BaseModel):
    """Number of alerts in each severity bucket."""

    # One required integer count per severity level.
    critical: int
    high: int
    medium: int
    low: int
356+
357+
358+
class AlertSummarySchema(BaseModel):
    """Aggregated alert statistics."""

    # Alert counts keyed by status string.
    by_status: dict[str, int]
    # Alert counts keyed by alert type string.
    by_type: dict[str, int]
    # Alert counts split into the fixed severity buckets.
    by_severity: AlertSeverityCount
    # Overall alert count.
    total: int
365+
366+
367+
class AlertTrendingPointSchema(BaseModel):
    """One point in the alert-volume time series."""

    # Date label carried as a string (exact format is set by the producer).
    date: str
    # Number of alerts for that date.
    count: int
372+
373+
374+
class AlertTrendingSchema(BaseModel):
    """Alert volume over time, as a list of per-date points."""

    # The time-series points.
    trending: list[AlertTrendingPointSchema]
    # Presumably the size of the look-back window in days -- confirm with the
    # route that fills this in.
    days: int
379+
380+
381+
class AlertListFilteredResponse(BaseModel):
    """Response body for a filtered alert listing."""

    # The alerts returned for the request.
    alerts: list[AnomalyAlertSchema]
    # Alert count reported alongside the list.
    total: int
    # The filter criteria included with the response.
    filters: AlertFilters
387+
388+
313389
class FixNoteRequest(BaseModel):
314390
"""Request schema for adding/updating a fix note."""
315391

@@ -405,3 +481,48 @@ class SimilarFailuresResponse(BaseModel):
405481
failure_event_id: str
406482
similar_failures: list[SimilarFailureSchema]
407483
total: int
484+
485+
486+
# ------------------------------------------------------------------
487+
# Alert policy schemas
488+
# ------------------------------------------------------------------
489+
490+
491+
class AlertPolicyCreate(BaseModel):
    """Request body for creating an alert policy."""

    # Optional agent name, at most 255 characters.
    agent_name: str | None = Field(None, max_length=255)
    # Required alert type, 1-64 characters.
    alert_type: str = Field(..., min_length=1, max_length=64)
    # Required threshold; must be non-negative.
    threshold_value: float = Field(..., ge=0.0)
    # Optional severity threshold label, at most 16 characters.
    severity_threshold: str | None = Field(None, max_length=16)
    # Policies are enabled unless the request says otherwise.
    enabled: bool = True
499+
500+
501+
class AlertPolicyUpdate(BaseModel):
    """Request body for updating an alert policy.

    Every field is optional and defaults to ``None``.
    """

    # Agent name, at most 255 characters.
    agent_name: str | None = Field(None, max_length=255)
    # Alert type, 1-64 characters when supplied.
    alert_type: str | None = Field(None, min_length=1, max_length=64)
    # Threshold; must be non-negative when supplied.
    threshold_value: float | None = Field(None, ge=0.0)
    # Severity threshold label, at most 16 characters.
    severity_threshold: str | None = Field(None, max_length=16)
    enabled: bool | None = None
509+
510+
511+
class AlertPolicySchema(BaseModel):
    """Alert policy as returned by the API."""

    # Policy identifier.
    id: str
    # Agent name; may be null.
    agent_name: str | None
    alert_type: str
    threshold_value: float
    # Severity threshold label; may be null.
    severity_threshold: str | None
    enabled: bool
    # Creation and last-update timestamps.
    created_at: datetime
    updated_at: datetime
522+
523+
524+
class AlertPolicyListResponse(BaseModel):
    """Response body for an alert-policy listing."""

    # The policies included in this response.
    policies: list[AlertPolicySchema]
    # Policy count reported alongside the list.
    total: int

0 commit comments

Comments
 (0)