Skip to content

Commit fdfd112

Browse files
committed
Merge branch 'feature/error-analyzer-2' into 'develop'
Feature/Monitoring foundation library See merge request genaiic-reusable-assets/engagement-artifacts/genaiic-idp-accelerator!604
2 parents 6a544e7 + 6c20862 commit fdfd112

12 files changed

Lines changed: 3368 additions & 0 deletions

File tree

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
# SPDX-License-Identifier: MIT-0
3+
4+
"""
5+
``idp_common.monitoring`` — Shared monitoring foundation library.
6+
7+
Provides reusable building blocks for all monitoring features across the IDP
8+
system. All public classes and functions are exported from this package so
9+
consumers only need a single import path:
10+
11+
from idp_common.monitoring import SettingsCache, TimeRange, DocumentRecord
12+
13+
Modules
14+
-------
15+
models
16+
Shared dataclasses: :class:`TimeRange`, :class:`LogEvent`,
17+
:class:`LogSearchResult`, :class:`TraceSegment`, :class:`DocumentRecord`,
18+
:class:`MonitoringKPIs`.
19+
20+
settings_cache
21+
:class:`SettingsCache` — TTL-based SSM/DynamoDB configuration cache.
22+
Module-level helpers: :func:`get_setting`, :func:`get_cloudwatch_log_groups`.
23+
24+
stack_utils
25+
Stack name resolution and AWS resource discovery utilities:
26+
:func:`get_stack_name`, :func:`extract_stack_name_from_arn`,
27+
:func:`get_stack_resources`, :func:`get_lambda_function_names`,
28+
:func:`get_state_machine_arn`.
29+
30+
stepfunctions_service
31+
Step Functions execution analysis:
32+
:func:`get_execution_arn_from_document`, :func:`get_execution_data`,
33+
:func:`analyze_execution_timeline`, :func:`extract_failure_details`.
34+
35+
xray_service
36+
X-Ray base trace service:
37+
:func:`get_trace_for_document`, :func:`analyze_trace`,
38+
:func:`get_subsegment_details`, :func:`extract_lambda_request_ids`.
39+
"""
40+
41+
# ---------------------------------------------------------------------------
42+
# Models
43+
# ---------------------------------------------------------------------------
44+
from idp_common.monitoring.models import (
45+
DocumentRecord,
46+
LogEvent,
47+
LogSearchResult,
48+
MonitoringKPIs,
49+
TimeRange,
50+
TraceSegment,
51+
)
52+
53+
# ---------------------------------------------------------------------------
54+
# Settings cache
55+
# ---------------------------------------------------------------------------
56+
from idp_common.monitoring.settings_cache import (
57+
SettingsCache,
58+
get_cloudwatch_log_groups,
59+
get_setting,
60+
)
61+
62+
# ---------------------------------------------------------------------------
63+
# Stack utilities
64+
# ---------------------------------------------------------------------------
65+
from idp_common.monitoring.stack_utils import (
66+
extract_stack_name_from_arn,
67+
get_lambda_function_names,
68+
get_stack_name,
69+
get_stack_resources,
70+
get_state_machine_arn,
71+
)
72+
73+
# ---------------------------------------------------------------------------
74+
# Step Functions service
75+
# ---------------------------------------------------------------------------
76+
from idp_common.monitoring.stepfunctions_service import (
77+
analyze_execution_timeline,
78+
extract_failure_details,
79+
get_execution_arn_from_document,
80+
get_execution_data,
81+
)
82+
83+
# ---------------------------------------------------------------------------
84+
# X-Ray service
85+
# ---------------------------------------------------------------------------
86+
from idp_common.monitoring.xray_service import (
87+
analyze_trace,
88+
extract_lambda_request_ids,
89+
get_subsegment_details,
90+
get_trace_for_document,
91+
)
92+
93+
# Public API surface of ``idp_common.monitoring``, grouped by the submodule
# each name is re-exported from (same grouping as the import blocks above).
__all__ = [
    # models — shared dataclasses
    "TimeRange", "LogEvent", "LogSearchResult",
    "TraceSegment", "DocumentRecord", "MonitoringKPIs",
    # settings_cache — TTL-based configuration cache
    "SettingsCache", "get_setting", "get_cloudwatch_log_groups",
    # stack_utils — stack name resolution / resource discovery
    "get_stack_name", "extract_stack_name_from_arn", "get_stack_resources",
    "get_lambda_function_names", "get_state_machine_arn",
    # stepfunctions_service — execution analysis
    "get_execution_arn_from_document", "get_execution_data",
    "analyze_execution_timeline", "extract_failure_details",
    # xray_service — base trace service
    "get_trace_for_document", "analyze_trace",
    "get_subsegment_details", "extract_lambda_request_ids",
]
Lines changed: 263 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,263 @@
1+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
# SPDX-License-Identifier: MIT-0
3+
4+
"""
5+
Shared data models for IDP monitoring services.
6+
7+
All monitoring features use these standard structures when passing data around,
8+
ensuring consistent types across the codebase.
9+
"""
10+
11+
from __future__ import annotations
12+
13+
from dataclasses import dataclass, field
14+
from datetime import datetime, timedelta, timezone
15+
from typing import Any, Dict, List
16+
17+
# ---------------------------------------------------------------------------
18+
# Module-level constants
19+
# ---------------------------------------------------------------------------
20+
21+
# Explicit set of status values that represent an actively running document.
22+
# Using an allowlist (rather than excluding terminal states) ensures that
23+
# unrecognised values such as "UNKNOWN" or "" are never treated as in-progress.
24+
_IN_PROGRESS_STATUSES: frozenset[str] = frozenset({"IN_PROGRESS", "RUNNING", "STARTED"})
25+
26+
27+
@dataclass
class TimeRange:
    """
    A closed time interval expressed as ISO 8601 UTC strings.

    Example::

        tr = TimeRange.last_n_hours(24)
        print(tr.start_time)  # "2026-03-25T12:00:00.000000Z"
    """

    start_time: str  # ISO 8601 UTC, e.g. "2026-03-25T12:00:00.000000Z"
    end_time: str  # ISO 8601 UTC

    # Plain (unannotated) class attribute, so the dataclass machinery does
    # not treat it as a field.
    _ISO_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

    @staticmethod
    def _to_utc_iso(dt: datetime) -> str:
        """Normalize *dt* to UTC and format it as an ISO 8601 'Z' string.

        Naive datetimes are assumed to already be in UTC. Aware datetimes
        are converted with ``astimezone`` — previously an aware non-UTC
        datetime kept its local wall-clock time while a literal "Z" was
        appended, silently mislabelling the timestamp as UTC.
        """
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        else:
            dt = dt.astimezone(timezone.utc)
        return dt.strftime(TimeRange._ISO_FORMAT)

    @classmethod
    def last_n_hours(cls, hours: int = 24) -> "TimeRange":
        """Return a TimeRange covering the last *hours* hours up to now (UTC)."""
        end = datetime.now(timezone.utc)
        start = end - timedelta(hours=hours)
        return cls(
            start_time=cls._to_utc_iso(start),
            end_time=cls._to_utc_iso(end),
        )

    @classmethod
    def from_datetimes(cls, start: datetime, end: datetime) -> "TimeRange":
        """Create a TimeRange from two datetime objects (converted to UTC).

        Naive inputs are treated as UTC; aware inputs are converted to UTC.
        """
        return cls(
            start_time=cls._to_utc_iso(start),
            end_time=cls._to_utc_iso(end),
        )

    def to_datetimes(self) -> tuple[datetime, datetime]:
        """Return (start, end) as timezone-aware datetime objects.

        Accepts timestamps with or without microseconds (e.g. both
        ``"2026-03-25T12:00:00Z"`` and ``"2026-03-25T12:00:00.123456Z"``).
        """

        def _parse(ts: str) -> datetime:
            # fromisoformat on older Pythons does not accept a trailing "Z",
            # so rewrite it as an explicit UTC offset first.
            return datetime.fromisoformat(ts.replace("Z", "+00:00"))

        return _parse(self.start_time), _parse(self.end_time)

    def duration_hours(self) -> float:
        """Return the duration of this time range in hours."""
        start, end = self.to_datetimes()
        return (end - start).total_seconds() / 3600
79+
80+
81+
@dataclass
class LogEvent:
    """A single CloudWatch log event.

    Plain data carrier; one instance per matched log event.
    """

    timestamp: str  # ISO 8601 event timestamp
    message: str  # raw log line text
    log_group: str  # CloudWatch log group the event belongs to
    log_stream: str  # log stream within that group
    request_id: str = ""  # associated request ID; empty string when not set
90+
91+
92+
@dataclass
class LogSearchResult:
    """Result of a CloudWatch log search across one or more log groups."""

    events: List[LogEvent] = field(default_factory=list)  # matched events
    log_groups_searched: List[str] = field(default_factory=list)  # groups queried
    # NOTE(review): presumably kept equal to len(events) by the search code
    # that builds this result — not enforced here; confirm against callers.
    total_events: int = 0
    search_duration_ms: float = 0.0  # wall-clock duration of the search
100+
101+
102+
@dataclass
class TraceSegment:
    """A single segment (or subsegment) within an AWS X-Ray trace."""

    id: str
    name: str
    start_time: float  # Unix epoch seconds
    end_time: float  # Unix epoch seconds
    duration_ms: float
    has_error: bool
    has_throttle: bool
    error_message: str = ""
    origin: str = ""  # e.g. "AWS::Lambda", "AWS::Bedrock"
    subsegments: List["TraceSegment"] = field(default_factory=list)

    @classmethod
    def from_xray_document(cls, doc: Dict[str, Any]) -> "TraceSegment":
        """Parse an X-Ray segment document dict into a TraceSegment.

        Nested ``subsegments`` are parsed recursively into the
        ``subsegments`` field (previously they were silently dropped even
        though the dataclass declares the field). The ``cause`` block is
        only inspected when it has the expected dict-of-exceptions shape —
        X-Ray may also emit ``cause`` as a bare exception-ID string.
        """
        start = doc.get("start_time", 0.0)
        end = doc.get("end_time", 0.0)
        duration_ms = (end - start) * 1000

        # Extract the first exception message from the cause block, if any.
        error_message = ""
        cause = doc.get("cause", {})
        if cause and isinstance(cause, dict):
            exceptions = cause.get("exceptions", [])
            if (
                isinstance(exceptions, list)
                and exceptions
                and isinstance(exceptions[0], dict)
            ):
                error_message = exceptions[0].get("message", "")

        # Recursively parse nested subsegments so callers see the full tree.
        raw_subs = doc.get("subsegments") or []
        children: List["TraceSegment"] = []
        if isinstance(raw_subs, list):
            children = [
                cls.from_xray_document(sub) for sub in raw_subs if isinstance(sub, dict)
            ]

        return cls(
            id=doc.get("id", ""),
            name=doc.get("name", ""),
            start_time=start,
            end_time=end,
            duration_ms=round(duration_ms, 1),
            has_error=bool(doc.get("error") or doc.get("fault")),
            has_throttle=bool(doc.get("throttle")),
            error_message=error_message,
            origin=doc.get("origin", ""),
            subsegments=children,
        )
143+
144+
145+
@dataclass
class DocumentRecord:
    """
    Represents a document processing record from the IDP tracking DynamoDB table.

    Field names follow the DynamoDB attribute naming used in the tracking table.
    """

    object_key: str  # S3 object key / document ID
    status: str  # COMPLETED | FAILED | IN_PROGRESS | ABORTED

    # Timestamps (ISO 8601 strings; empty string if not yet set)
    queued_time: str = ""
    start_time: str = ""
    completion_time: str = ""

    # Tracing
    workflow_execution_arn: str = ""
    trace_id: str = ""

    # Processing metadata
    num_pages: int = 0
    document_class: str = ""
    config_version: str = ""

    # Raw DynamoDB item (preserved for downstream access to any field)
    raw: Dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_dynamodb_item(cls, item: Dict[str, Any]) -> "DocumentRecord":
        """
        Construct a DocumentRecord from a raw DynamoDB tracking table item.

        Handles both key variants (``PK`` with a ``doc#`` prefix, or a plain
        ``ObjectKey``) and both status variants (``ObjectStatus`` /
        ``WorkflowStatus``). Only a *leading* ``doc#`` prefix is stripped
        from the partition key — the previous ``str.replace("doc#", "")``
        also mangled object keys that happened to contain ``doc#`` elsewhere
        in the path. String fields are guarded with ``or ""`` so an
        attribute stored as an explicit ``None`` never leaks through.
        """
        pk = item.get("PK") or ""
        if pk.startswith("doc#"):
            pk = pk[len("doc#"):]
        object_key = pk or item.get("ObjectKey", "") or ""

        status = item.get("ObjectStatus") or item.get("WorkflowStatus") or "UNKNOWN"

        return cls(
            object_key=object_key,
            status=status,
            queued_time=item.get("InitialEventTime", "") or "",
            start_time=item.get("StartTime", "") or "",
            completion_time=item.get("CompletionTime", "") or "",
            workflow_execution_arn=(
                item.get("WorkflowExecutionArn") or item.get("ExecutionArn") or ""
            ),
            trace_id=item.get("TraceId", "") or "",
            # NumPages may arrive as a numeric string; `or 0` also covers
            # explicit None before the int() conversion.
            num_pages=int(item.get("NumPages", 0) or 0),
            document_class=item.get("DocumentClass", "") or "",
            config_version=item.get("ConfigVersion", "") or "",
            raw=item,
        )

    def is_failed(self) -> bool:
        """Return True if the document processing failed."""
        return self.status == "FAILED"

    def is_completed(self) -> bool:
        """Return True if the document was processed successfully."""
        return self.status == "COMPLETED"

    def is_in_progress(self) -> bool:
        """Return True if the document is currently being processed.

        Uses an explicit allowlist rather than an exclusion list so that
        unrecognised or default status values (e.g. ``"UNKNOWN"``, ``""``)
        are not mistakenly treated as active.
        """
        return self.status in _IN_PROGRESS_STATUSES
216+
217+
218+
@dataclass
class MonitoringKPIs:
    """
    Aggregated key performance indicators for a monitoring time window.

    Used by the Dashboard Service and the UI monitoring page.
    """

    # Volume
    total_documents: int = 0
    total_pages: int = 0
    total_input_tokens: int = 0
    total_output_tokens: int = 0

    # Status breakdown
    completed_documents: int = 0
    failed_documents: int = 0
    in_progress_documents: int = 0
    aborted_documents: int = 0

    # Cost
    total_cost: float = 0.0
    avg_cost_per_document: float = 0.0

    # Health
    failure_rate: float = 0.0  # 0.0–1.0

    # Configuration
    active_config_count: int = 0

    def compute_derived(self) -> None:
        """Recompute ``failure_rate`` and ``avg_cost_per_document``.

        Both derived fields are unconditionally reassigned on every call, so
        re-running after the raw counters change (e.g. total_cost dropping
        back to 0) never leaves a stale value from a previous computation.
        """
        doc_count = self.total_documents
        if doc_count <= 0:
            # Nothing processed in the window: both rates are defined as 0.
            self.failure_rate = 0.0
            self.avg_cost_per_document = 0.0
            return
        self.failure_rate = self.failed_documents / doc_count
        if self.total_cost > 0:
            self.avg_cost_per_document = self.total_cost / doc_count
        else:
            self.avg_cost_per_document = 0.0

0 commit comments

Comments
 (0)