# Aurora AI Monitoring and Analytics Guide

Enterprise Monitoring • Real-time Analytics • Predictive Intelligence • Alert Management

This guide covers monitoring, observability, and analytics for the Aurora AI framework. Spanning 27 integrated systems and 74 API endpoints, it lays out enterprise-grade monitoring strategies, analytics implementations, and operational intelligence. The components of the stack are summarized in `monitoring/stack.yaml`:
```yaml
# monitoring/stack.yaml
monitoring_stack:
  data_collection:
    - prometheus_server
    - grafana_dashboards
    - elk_stack
    - jaeger_tracing
  alerting:
    - alertmanager
    - pagerduty_integration
    - slack_notifications
    - email_alerts
  analytics:
    - time_series_database
    - machine_learning_analytics
    - anomaly_detection
    - predictive_analytics
  visualization:
    - grafana_dashboards
    - kibana_dashboards
    - custom_dashboards
    - real_time_visualizations
```
Prometheus scrapes the Aurora application, the database and cache exporters, Kubernetes pods, and the node exporter:

```yaml
# monitoring/prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'aurora-ai'
    replica: 'prometheus-1'

rule_files:
  - "aurora_rules.yml"
  - "security_rules.yml"
  - "performance_rules.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

scrape_configs:
  # Aurora AI Application Metrics
  - job_name: 'aurora-web'
    static_configs:
      - targets: ['aurora-web:8080']
    metrics_path: '/api/monitoring/metrics'
    scrape_interval: 30s
    scrape_timeout: 10s
    # Relabeling for better metric organization
    relabel_configs:
      - source_labels: [__address__]
        target_label: instance
        replacement: 'aurora-web-1'
      - source_labels: [__address__]
        target_label: service
        replacement: 'aurora-ai'

  # Database Metrics
  - job_name: 'postgres-exporter'
    static_configs:
      - targets: ['postgres-exporter:9187']
    scrape_interval: 30s

  # Redis Metrics
  - job_name: 'redis-exporter'
    static_configs:
      - targets: ['redis-exporter:9121']
    scrape_interval: 30s

  # Kubernetes Metrics
  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__

  # Node Exporter
  - job_name: 'node-exporter'
    static_configs:
      - targets: ['node-exporter:9100']
    scrape_interval: 30s
```
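The `aurora-web` job above scrapes `/api/monitoring/metrics` on port 8080. A minimal sketch of serving that path from the Flask application, assuming `prometheus_client`'s default registry (the route wiring here is illustrative, not part of the Aurora source):

```python
# Hypothetical sketch: expose the metrics path that the 'aurora-web' job scrapes.
from flask import Flask, Response
from prometheus_client import CONTENT_TYPE_LATEST, generate_latest

app = Flask(__name__)

@app.route('/api/monitoring/metrics')
def metrics_endpoint():
    # Serialize every metric in the default registry in the Prometheus text format.
    return Response(generate_latest(), mimetype=CONTENT_TYPE_LATEST)

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080)
```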
"dashboard": {
"id": null,
"title": "Aurora AI - System Overview",
"tags": ["aurora-ai", "system"],
"timezone": "browser",
"panels": [
{
"id": 1,
"title": "System Health Score",
"type": "stat",
"targets": [
{
"expr": "aurora_system_health_score",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"thresholds": {
"steps": [
{"color": "red", "value": 0},
{"color": "yellow", "value": 70},
{"color": "green", "value": 90}
]
}
}
},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}
},
{
"id": 2,
"title": "API Request Rate",
"type": "graph",
"targets": [
{
"expr": "rate(aurora_api_requests_total[5m])",
"refId": "A",
"legendFormat": "{{method}} {{endpoint}}"
}
],
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}
},
{
"id": 3,
"title": "Response Time Distribution",
"type": "heatmap",
"targets": [
{
"expr": "rate(aurora_api_request_duration_seconds_bucket[5m])",
"refId": "A"
}
],
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 8}
},
{
"id": 4,
"title": "Error Rate",
"type": "graph",
"targets": [
{
"expr": "rate(aurora_api_requests_total{status=~\"5..\"}[5m]) / rate(aurora_api_requests_total[5m])",
"refId": "A",
"legendFormat": "Error Rate"
}
],
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}
},
{
"id": 5,
"title": "Resource Utilization",
"type": "graph",
"targets": [
{
"expr": "rate(container_cpu_usage_seconds_total[5m])",
"refId": "A",
"legendFormat": "CPU Usage"
},
{
"expr": "container_memory_usage_bytes / 1024 / 1024",
"refId": "B",
"legendFormat": "Memory Usage (MB)"
}
],
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 16}
}
],
"time": {"from": "now-1h", "to": "now"},
"refresh": "30s"
}
}# monitoring/metrics_collector.py
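The dashboard can be provisioned through Grafana's HTTP API. A hedged sketch, assuming the JSON above is saved as `dashboard.json` and a Grafana service-account token is available (the URL and token are placeholders):

```python
# Hypothetical sketch: import the dashboard via Grafana's /api/dashboards/db endpoint.
import json
import requests

GRAFANA_URL = "http://grafana:3000"  # assumption: in-cluster Grafana address
API_TOKEN = "REPLACE_ME"             # assumption: Grafana service-account token

with open("dashboard.json") as f:
    payload = json.load(f)           # already shaped as {"dashboard": {...}}
payload["overwrite"] = True          # replace an existing dashboard with the same title/uid

resp = requests.post(
    f"{GRAFANA_URL}/api/dashboards/db",
    headers={"Authorization": f"Bearer {API_TOKEN}"},
    json=payload,
    timeout=10,
)
resp.raise_for_status()
print(resp.json().get("url"))
```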
The application-side collector registers request, system, and business metrics and serves them for scraping. Two engagement counters (`user_engagement`, `feature_usage`) are registered here because the `BusinessMetricsTracker` below increments them; their metric names are assumptions, since the original listing referenced them without defining them.

```python
# monitoring/metrics_collector.py
from prometheus_client import Counter, Histogram, Gauge, start_http_server
import functools
import threading
import time

import psutil
from flask import request


class AuroraMetricsCollector:
    def __init__(self):
        # Request metrics
        self.request_count = Counter(
            'aurora_api_requests_total',
            'Total API requests',
            ['method', 'endpoint', 'status']
        )
        self.request_duration = Histogram(
            'aurora_api_request_duration_seconds',
            'API request duration',
            ['method', 'endpoint'],
            buckets=[0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0]
        )

        # System metrics
        self.system_health = Gauge(
            'aurora_system_health_score',
            'System health score (0-100)'
        )
        self.active_connections = Gauge(
            'aurora_active_connections',
            'Number of active connections'
        )
        self.model_performance = Gauge(
            'aurora_model_performance_score',
            'Model performance score',
            ['model_id']
        )

        # Business metrics
        self.predictions_made = Counter(
            'aurora_predictions_total',
            'Total predictions made',
            ['model_id', 'prediction_type']
        )
        self.data_validations = Counter(
            'aurora_data_validations_total',
            'Total data validations',
            ['validation_type', 'result']
        )
        self.training_jobs = Counter(
            'aurora_training_jobs_total',
            'Total training jobs',
            ['status', 'algorithm']
        )
        # Engagement counters used by BusinessMetricsTracker (metric names assumed)
        self.user_engagement = Counter(
            'aurora_user_engagement_total',
            'User engagement events',
            ['action', 'feature']
        )
        self.feature_usage = Counter(
            'aurora_feature_usage_total',
            'Feature usage events',
            ['feature', 'usage_type']
        )

        # Start metrics server
        start_http_server(8000)
        # Start background metrics collection
        self.start_background_collection()

    def track_request(self, func):
        """Decorator to track API requests"""
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            start_time = time.time()
            status = 500  # assume failure unless the handler returns
            try:
                response = func(*args, **kwargs)
                status = getattr(response, 'status_code', 200)
                return response
            finally:
                duration = time.time() - start_time
                # Record metrics even when the handler raised
                self.request_count.labels(
                    method=request.method,
                    endpoint=request.endpoint,
                    status=status
                ).inc()
                self.request_duration.labels(
                    method=request.method,
                    endpoint=request.endpoint
                ).observe(duration)
        return wrapper

    def track_prediction(self, model_id, prediction_type, success=True):
        """Track prediction metrics"""
        self.predictions_made.labels(
            model_id=model_id,
            prediction_type=prediction_type
        ).inc()

    def track_data_validation(self, validation_type, result):
        """Track data validation metrics"""
        self.data_validations.labels(
            validation_type=validation_type,
            result=result
        ).inc()

    def track_training_job(self, status, algorithm):
        """Track training job metrics"""
        self.training_jobs.labels(
            status=status,
            algorithm=algorithm
        ).inc()

    def update_system_health(self, health_score):
        """Update system health score"""
        self.system_health.set(health_score)

    def update_active_connections(self, count):
        """Update active connections count"""
        self.active_connections.set(count)

    def update_model_performance(self, model_id, performance_score):
        """Update model performance score"""
        self.model_performance.labels(model_id=model_id).set(performance_score)

    def start_background_collection(self):
        """Start background metrics collection"""
        def collect_system_metrics():
            while True:
                try:
                    # CPU usage
                    cpu_percent = psutil.cpu_percent(interval=1)
                    # Memory usage
                    memory = psutil.virtual_memory()
                    memory_percent = memory.percent
                    # Disk usage
                    disk = psutil.disk_usage('/')
                    disk_percent = disk.percent
                    # Network I/O (collected but not yet exported as a metric)
                    network = psutil.net_io_counters()
                    # Update custom metrics
                    self.update_system_health(
                        self.calculate_health_score(cpu_percent, memory_percent, disk_percent)
                    )
                except Exception as e:
                    print(f"Error collecting system metrics: {e}")
                time.sleep(30)  # Collect every 30 seconds

        # Start background thread
        thread = threading.Thread(target=collect_system_metrics, daemon=True)
        thread.start()

    def calculate_health_score(self, cpu_percent, memory_percent, disk_percent):
        """Calculate overall system health score"""
        # Weight different factors
        cpu_weight = 0.3
        memory_weight = 0.3
        disk_weight = 0.4
        # Individual scores are the inverse of utilization
        cpu_score = max(0, 100 - cpu_percent)
        memory_score = max(0, 100 - memory_percent)
        disk_score = max(0, 100 - disk_percent)
        # Weighted average
        health_score = (
            cpu_score * cpu_weight +
            memory_score * memory_weight +
            disk_score * disk_weight
        )
        return round(health_score, 2)
```
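A short usage sketch tying the collector to the Flask app from earlier (the route and model names are illustrative):

```python
# Hypothetical wiring: decorate a route so requests and predictions are counted.
from flask import Flask, jsonify

app = Flask(__name__)
metrics = AuroraMetricsCollector()  # also starts the metrics server on :8000

@app.route('/api/predict')
@metrics.track_request
def predict():
    # ... run the model here ...
    metrics.track_prediction(model_id='model-1', prediction_type='online')
    return jsonify({'status': 'ok'})
```

Note the decorator order: `track_request` wraps the handler first, so Flask registers the instrumented function (and `functools.wraps` preserves its name).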
Business metrics build on the collector: the tracker keeps in-memory engagement state and mirrors events into the Prometheus counters registered above (the `time` import, missing in the original listing, is added).

```python
# monitoring/business_metrics.py
import time


class BusinessMetricsTracker:
    def __init__(self, metrics_collector):
        self.metrics = metrics_collector
        self.user_engagement = {}
        self.feature_usage = {}

    def track_user_engagement(self, user_id, action, feature):
        """Track user engagement metrics"""
        timestamp = time.time()
        if user_id not in self.user_engagement:
            self.user_engagement[user_id] = {
                'first_seen': timestamp,
                'last_seen': timestamp,
                'actions': [],
                'features_used': set()
            }
        self.user_engagement[user_id]['last_seen'] = timestamp
        self.user_engagement[user_id]['actions'].append({
            'timestamp': timestamp,
            'action': action,
            'feature': feature
        })
        self.user_engagement[user_id]['features_used'].add(feature)
        # Update Prometheus metrics
        self.metrics.user_engagement.labels(
            action=action,
            feature=feature
        ).inc()

    def track_feature_usage(self, feature, usage_type, value=1):
        """Track feature usage metrics"""
        if feature not in self.feature_usage:
            self.feature_usage[feature] = {
                'total_usage': 0,
                'usage_types': {},
                'first_used': time.time(),
                'last_used': time.time()
            }
        self.feature_usage[feature]['total_usage'] += value
        self.feature_usage[feature]['last_used'] = time.time()
        if usage_type not in self.feature_usage[feature]['usage_types']:
            self.feature_usage[feature]['usage_types'][usage_type] = 0
        self.feature_usage[feature]['usage_types'][usage_type] += value
        # Update Prometheus metrics
        self.metrics.feature_usage.labels(
            feature=feature,
            usage_type=usage_type
        ).inc()

    def generate_engagement_report(self, hours=24):
        """Generate user engagement report"""
        cutoff_time = time.time() - (hours * 3600)
        active_users = 0
        total_actions = 0
        feature_usage = {}
        for user_id, engagement in self.user_engagement.items():
            if engagement['last_seen'] > cutoff_time:
                active_users += 1
                total_actions += len(engagement['actions'])
                for feature in engagement['features_used']:
                    feature_usage[feature] = feature_usage.get(feature, 0) + 1
        return {
            'period_hours': hours,
            'active_users': active_users,
            'total_actions': total_actions,
            'actions_per_user': total_actions / active_users if active_users > 0 else 0,
            'feature_usage': feature_usage,
            'most_used_feature': max(feature_usage.items(), key=lambda x: x[1])[0] if feature_usage else None
        }
```
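Reusing `metrics` from the previous sketch, the tracker records events and summarizes them (the user ID and feature names are illustrative):

```python
# Hypothetical usage of BusinessMetricsTracker.
tracker = BusinessMetricsTracker(metrics)
tracker.track_user_engagement(user_id='u-123', action='run_inference', feature='predictions')
tracker.track_feature_usage(feature='predictions', usage_type='api_call')

report = tracker.generate_engagement_report(hours=24)
print(report['active_users'], report['most_used_feature'])
```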
Anomaly detection combines per-metric Isolation Forest models with static thresholds (expressed as fractions for the utilization metrics):

```python
# monitoring/anomaly_detection.py
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler


class AnomalyDetectionEngine:
    def __init__(self):
        self.models = {}
        self.scalers = {}
        self.anomaly_thresholds = {
            'response_time': 5.0,   # seconds
            'error_rate': 0.05,     # 5%
            'cpu_usage': 0.90,      # 90%
            'memory_usage': 0.85    # 85%
        }
        self.anomaly_history = []

    def train_anomaly_models(self, metrics_data):
        """Train anomaly detection models on historical data"""
        df = pd.DataFrame(metrics_data)
        # Train models for different metric types
        metric_columns = ['cpu_usage', 'memory_usage', 'disk_usage',
                          'response_time', 'error_rate', 'throughput']
        for metric in metric_columns:
            if metric in df.columns:
                # Prepare data
                data = df[metric].values.reshape(-1, 1)
                # Scale data
                scaler = StandardScaler()
                scaled_data = scaler.fit_transform(data)
                # Train Isolation Forest
                model = IsolationForest(
                    contamination=0.1,
                    random_state=42,
                    n_estimators=100
                )
                model.fit(scaled_data)
                # Store model and scaler
                self.models[metric] = model
                self.scalers[metric] = scaler

    def detect_anomalies(self, current_metrics):
        """Detect anomalies in current metrics"""
        anomalies = []
        for metric, value in current_metrics.items():
            if metric in self.models:
                # Prepare data
                data = np.array([[value]])
                scaled_data = self.scalers[metric].transform(data)
                # Predict anomaly
                prediction = self.models[metric].predict(scaled_data)[0]
                anomaly_score = self.models[metric].decision_function(scaled_data)[0]
                # Isolation Forest labels anomalies as -1
                is_anomaly = prediction == -1
                # Also check against static thresholds
                threshold_anomaly = False
                if metric in self.anomaly_thresholds:
                    threshold = self.anomaly_thresholds[metric]
                    if metric in ['response_time', 'error_rate', 'cpu_usage', 'memory_usage']:
                        threshold_anomaly = value > threshold
                if is_anomaly or threshold_anomaly:
                    anomalies.append({
                        'metric': metric,
                        'value': value,
                        'anomaly_score': anomaly_score,
                        'threshold_violation': threshold_anomaly,
                        'severity': self._calculate_severity(metric, value, anomaly_score),
                        'timestamp': datetime.utcnow().isoformat()
                    })
        # Store anomalies, keeping only the most recent 1000
        self.anomaly_history.extend(anomalies)
        if len(self.anomaly_history) > 1000:
            self.anomaly_history = self.anomaly_history[-1000:]
        return anomalies

    def _calculate_severity(self, metric, value, anomaly_score):
        """Calculate anomaly severity"""
        base_severity = abs(anomaly_score)
        # Adjust based on metric importance
        critical_metrics = ['error_rate', 'cpu_usage', 'memory_usage']
        if metric in critical_metrics:
            base_severity *= 1.5
        # Determine severity level
        if base_severity > 0.8:
            return 'critical'
        elif base_severity > 0.5:
            return 'high'
        elif base_severity > 0.3:
            return 'medium'
        else:
            return 'low'

    def get_anomaly_summary(self, hours=24):
        """Get anomaly summary for specified period"""
        cutoff_time = datetime.utcnow() - timedelta(hours=hours)
        recent_anomalies = [
            anomaly for anomaly in self.anomaly_history
            if datetime.fromisoformat(anomaly['timestamp']) > cutoff_time
        ]
        if not recent_anomalies:
            return {
                'period_hours': hours,
                'total_anomalies': 0,
                'severity_breakdown': {},
                'metric_breakdown': {},
                'trend': 'stable'
            }
        # Analyze anomalies
        severity_breakdown = {}
        metric_breakdown = {}
        for anomaly in recent_anomalies:
            severity = anomaly['severity']
            metric = anomaly['metric']
            severity_breakdown[severity] = severity_breakdown.get(severity, 0) + 1
            metric_breakdown[metric] = metric_breakdown.get(metric, 0) + 1
        # Determine trend from the last hour's volume
        if len(recent_anomalies) > 10:
            recent_count = len([a for a in recent_anomalies
                                if datetime.fromisoformat(a['timestamp']) >
                                datetime.utcnow() - timedelta(hours=1)])
            if recent_count > 5:
                trend = 'increasing'
            elif recent_count < 2:
                trend = 'decreasing'
            else:
                trend = 'stable'
        else:
            trend = 'insufficient_data'
        return {
            'period_hours': hours,
            'total_anomalies': len(recent_anomalies),
            'severity_breakdown': severity_breakdown,
            'metric_breakdown': metric_breakdown,
            'trend': trend,
            'recommendations': self._generate_anomaly_recommendations(recent_anomalies)
        }

    def _generate_anomaly_recommendations(self, anomalies):
        """Generate recommendations based on anomalies"""
        recommendations = []
        critical_anomalies = [a for a in anomalies if a['severity'] == 'critical']
        if critical_anomalies:
            recommendations.append({
                'priority': 'critical',
                'issue': 'Critical anomalies detected',
                'action': 'Immediate investigation required',
                'affected_metrics': list(set(a['metric'] for a in critical_anomalies))
            })
        if len(anomalies) > 20:
            recommendations.append({
                'priority': 'high',
                'issue': 'High anomaly volume',
                'action': 'Review system performance and capacity',
                'affected_metrics': list(set(a['metric'] for a in anomalies))
            })
        # Check for specific metric patterns
        error_anomalies = [a for a in anomalies if a['metric'] == 'error_rate']
        if len(error_anomalies) > 5:
            recommendations.append({
                'priority': 'high',
                'issue': 'Elevated error rate anomalies',
                'action': 'Investigate application errors and logs',
                'affected_metrics': ['error_rate']
            })
        return recommendations
```
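A minimal training-and-scoring sketch; the history rows are synthetic stand-ins for data pulled from the time-series database, and utilizations are fractions to match the static thresholds above:

```python
# Hypothetical usage of AnomalyDetectionEngine with synthetic history.
anomaly_engine = AnomalyDetectionEngine()

history = [
    {'cpu_usage': 0.40 + (i % 10) / 100, 'memory_usage': 0.60, 'disk_usage': 0.20,
     'response_time': 0.5, 'error_rate': 0.01, 'throughput': 800}
    for i in range(200)
]
anomaly_engine.train_anomaly_models(history)

# 0.97 CPU breaches the 0.90 static threshold and sits far from the training distribution.
anomalies = anomaly_engine.detect_anomalies(
    {'cpu_usage': 0.97, 'memory_usage': 0.62, 'response_time': 0.6, 'error_rate': 0.01}
)
for a in anomalies:
    print(a['metric'], a['severity'], round(a['anomaly_score'], 3))
```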
Predictive analytics trains a Random Forest regressor per target on time-bucketed history. The original listing was missing the `pandas` and `datetime` imports (added here), and the prediction path now orders the feature vector to match `feature_columns`, since a plain `dict.values()` ordering would not line up with the training columns.

```python
# monitoring/predictive_analytics.py
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


class PredictiveAnalyticsEngine:
    def __init__(self):
        self.models = {}
        self.feature_columns = [
            'hour_of_day', 'day_of_week', 'cpu_usage', 'memory_usage',
            'active_connections', 'requests_per_minute'
        ]
        self.target_columns = ['response_time', 'error_rate', 'throughput']

    def train_predictive_models(self, historical_data):
        """Train predictive models on historical data"""
        df = pd.DataFrame(historical_data)
        # Prepare time-based features
        df['hour_of_day'] = pd.to_datetime(df['timestamp']).dt.hour
        df['day_of_week'] = pd.to_datetime(df['timestamp']).dt.dayofweek
        # Train a model for each target
        for target in self.target_columns:
            if target in df.columns:
                # Prepare training data
                X = df[self.feature_columns].fillna(0)
                y = df[target].fillna(0)
                # Split data (80% train, 20% test)
                split_idx = int(len(X) * 0.8)
                X_train, X_test = X[:split_idx], X[split_idx:]
                y_train, y_test = y[:split_idx], y[split_idx:]
                # Train Random Forest model
                model = RandomForestRegressor(
                    n_estimators=100,
                    random_state=42,
                    max_depth=10
                )
                model.fit(X_train, y_train)
                # Evaluate model
                y_pred = model.predict(X_test)
                mae = mean_absolute_error(y_test, y_pred)
                mse = mean_squared_error(y_test, y_pred)
                # Store model
                self.models[target] = {
                    'model': model,
                    'mae': mae,
                    'mse': mse,
                    'feature_importance': dict(zip(self.feature_columns, model.feature_importances_))
                }

    def predict_metrics(self, current_data, horizon_minutes=60):
        """Predict metrics for specified horizon"""
        predictions = {}
        # Prepare features from current data
        features = self._prepare_features(current_data)
        for target, model_info in self.models.items():
            model = model_info['model']
            # Predict for future time points, every 15 minutes
            future_predictions = []
            for minute_ahead in range(0, horizon_minutes + 1, 15):
                # Add time-based features
                future_time = datetime.utcnow() + timedelta(minutes=minute_ahead)
                future_features = features.copy()
                future_features['hour_of_day'] = future_time.hour
                future_features['day_of_week'] = future_time.weekday()
                # Make prediction; order the vector to match the training columns
                feature_array = np.array(
                    [[future_features[col] for col in self.feature_columns]]
                )
                prediction = model.predict(feature_array)[0]
                future_predictions.append({
                    'timestamp': future_time.isoformat(),
                    'minutes_ahead': minute_ahead,
                    'predicted_value': max(0, prediction),  # Ensure non-negative
                    'confidence': self._calculate_confidence(model_info, prediction)
                })
            predictions[target] = {
                'predictions': future_predictions,
                'model_accuracy': {
                    'mae': model_info['mae'],
                    'mse': model_info['mse']
                },
                'feature_importance': model_info['feature_importance']
            }
        return predictions

    def _prepare_features(self, current_data):
        """Prepare features from current data"""
        features = {}
        # Current metrics
        features['cpu_usage'] = current_data.get('cpu_usage', 0)
        features['memory_usage'] = current_data.get('memory_usage', 0)
        features['active_connections'] = current_data.get('active_connections', 0)
        features['requests_per_minute'] = current_data.get('requests_per_minute', 0)
        # Time-based features
        now = datetime.utcnow()
        features['hour_of_day'] = now.hour
        features['day_of_week'] = now.weekday()
        return features

    def _calculate_confidence(self, model_info, prediction):
        """Calculate prediction confidence based on model accuracy"""
        mae = model_info['mae']
        # Simple confidence banding based on MAE
        if mae < 0.1:
            return 0.9  # High confidence
        elif mae < 0.5:
            return 0.7  # Medium confidence
        else:
            return 0.5  # Low confidence

    def generate_insights(self, predictions):
        """Generate insights from predictions"""
        insights = []
        for target, prediction_data in predictions.items():
            future_values = [p['predicted_value'] for p in prediction_data['predictions']]
            if len(future_values) < 2:
                continue
            # Fit a linear trend to the forecast
            trend = np.polyfit(range(len(future_values)), future_values, 1)[0]
            if trend > 0.1:
                trend_direction = 'increasing'
            elif trend < -0.1:
                trend_direction = 'decreasing'
            else:
                trend_direction = 'stable'
            # Flag concerning trends
            if target == 'error_rate' and trend_direction == 'increasing':
                insights.append({
                    'type': 'warning',
                    'metric': target,
                    'message': 'Error rate predicted to increase',
                    'severity': 'high' if trend > 0.5 else 'medium'
                })
            if target == 'response_time' and trend_direction == 'increasing':
                insights.append({
                    'type': 'warning',
                    'metric': target,
                    'message': 'Response time predicted to increase',
                    'severity': 'medium'
                })
            if target == 'throughput' and trend_direction == 'decreasing':
                insights.append({
                    'type': 'warning',
                    'metric': target,
                    'message': 'Throughput predicted to decrease',
                    'severity': 'medium'
                })
        return insights
```
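Training and forecasting, sketched with synthetic rows (values are fabricated for shape only; real rows come from the metrics store):

```python
# Hypothetical usage of PredictiveAnalyticsEngine.
from datetime import datetime, timedelta

forecast_engine = PredictiveAnalyticsEngine()

base = datetime.utcnow() - timedelta(days=7)
rows = [
    {'timestamp': (base + timedelta(minutes=15 * i)).isoformat(),
     'cpu_usage': 0.40, 'memory_usage': 0.60, 'active_connections': 20,
     'requests_per_minute': 800, 'response_time': 0.5, 'error_rate': 0.01,
     'throughput': 780}
    for i in range(7 * 24 * 4)   # one week at 15-minute resolution
]
forecast_engine.train_predictive_models(rows)

preds = forecast_engine.predict_metrics(
    {'cpu_usage': 0.45, 'memory_usage': 0.65,
     'active_connections': 25, 'requests_per_minute': 850},
    horizon_minutes=60,
)
for insight in forecast_engine.generate_insights(preds):
    print(insight['severity'], insight['message'])
```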
The dashboard provider aggregates the collector, anomaly engine, and predictive engine behind a single payload. The leaf getters return placeholder values until wired to Prometheus; stubs for the getters the original listing referenced but never defined are added at the bottom so the class is runnable.

```python
# monitoring/dashboard_provider.py
class DashboardDataProvider:
    def __init__(self, metrics_collector, anomaly_detector, predictive_analytics):
        self.metrics_collector = metrics_collector
        self.anomaly_detector = anomaly_detector
        self.predictive_analytics = predictive_analytics

    def get_dashboard_data(self):
        """Get comprehensive dashboard data"""
        return {
            'overview': self._get_overview_metrics(),
            'performance': self._get_performance_metrics(),
            'anomalies': self._get_anomaly_data(),
            'predictions': self._get_prediction_data(),
            'alerts': self._get_active_alerts(),
            'system_health': self._get_system_health()
        }

    def _get_overview_metrics(self):
        """Get overview metrics"""
        return {
            'total_requests': self._get_metric_value('aurora_api_requests_total'),
            'active_users': self._get_active_users_count(),
            'system_health': self._get_system_health_score(),
            'uptime_percentage': self._get_uptime_percentage(),
            'error_rate': self._get_current_error_rate(),
            'average_response_time': self._get_average_response_time()
        }

    def _get_performance_metrics(self):
        """Get detailed performance metrics"""
        return {
            'request_rate': self._get_request_rate(),
            'response_time_distribution': self._get_response_time_distribution(),
            'throughput': self._get_throughput(),
            'resource_utilization': self._get_resource_utilization(),
            'database_performance': self._get_database_performance(),
            'cache_performance': self._get_cache_performance()
        }

    def _get_anomaly_data(self):
        """Get anomaly detection data"""
        anomaly_summary = self.anomaly_detector.get_anomaly_summary(hours=24)
        return {
            'summary': anomaly_summary,
            'recent_anomalies': self.anomaly_detector.anomaly_history[-10:],
            'anomaly_trends': self._get_anomaly_trends(),
            'recommendations': anomaly_summary.get('recommendations', [])
        }

    def _get_prediction_data(self):
        """Get predictive analytics data"""
        current_data = self._get_current_metrics()
        predictions = self.predictive_analytics.predict_metrics(current_data, horizon_minutes=60)
        return {
            'predictions': predictions,
            'insights': self.predictive_analytics.generate_insights(predictions),
            'model_accuracy': self._get_model_accuracy()
        }

    def _get_active_alerts(self):
        """Get active alerts"""
        return {
            'critical_alerts': self._get_critical_alerts(),
            'warning_alerts': self._get_warning_alerts(),
            'info_alerts': self._get_info_alerts(),
            'alert_trends': self._get_alert_trends()
        }

    def _get_system_health(self):
        """Get detailed system health"""
        return {
            'overall_health': self._get_system_health_score(),
            'component_health': {
                'web_server': self._get_component_health('web_server'),
                'database': self._get_component_health('database'),
                'cache': self._get_component_health('cache'),
                'ml_models': self._get_component_health('ml_models')
            },
            'health_trends': self._get_health_trends(),
            'health_recommendations': self._get_health_recommendations()
        }

    # --- Placeholder leaf getters -------------------------------------------
    # The values below are placeholders; real implementations would query
    # Prometheus, Alertmanager, and the engines above.

    def _get_metric_value(self, metric_name):
        """Get current value for a metric (would integrate with Prometheus)"""
        return 0  # Placeholder

    def _get_active_users_count(self):
        """Get active users count"""
        return 1250  # Placeholder

    def _get_system_health_score(self):
        """Get system health score"""
        return 92.5  # Placeholder

    def _get_uptime_percentage(self):
        """Get uptime percentage"""
        return 99.9  # Placeholder

    def _get_current_error_rate(self):
        """Get current error rate"""
        return 0.02  # Placeholder

    def _get_average_response_time(self):
        """Get average response time"""
        return 0.8  # Placeholder

    def _get_request_rate(self):
        """Get current request rate"""
        return 850  # Placeholder

    def _get_response_time_distribution(self):
        """Get response time distribution"""
        return {
            'p50': 0.5,
            'p95': 1.2,
            'p99': 2.8,
            'max': 5.5
        }

    def _get_throughput(self):
        """Get current throughput"""
        return 850  # Placeholder

    def _get_resource_utilization(self):
        """Get resource utilization"""
        return {
            'cpu': 45.2,
            'memory': 67.8,
            'disk': 23.4,
            'network': 12.1
        }

    def _get_database_performance(self):
        """Get database performance metrics"""
        return {
            'connections': 25,
            'query_time_avg': 0.15,
            'slow_queries': 2,
            'cache_hit_rate': 94.2
        }

    def _get_cache_performance(self):
        """Get cache performance metrics"""
        return {
            'hit_rate': 89.5,
            'miss_rate': 10.5,
            'evictions': 5,
            'memory_usage': 67.3
        }

    def _get_current_metrics(self):
        """Get current system metrics"""
        return {
            'cpu_usage': 45.2,
            'memory_usage': 67.8,
            'active_connections': 25,
            'requests_per_minute': 850
        }

    def _get_model_accuracy(self):
        """Get model accuracy metrics"""
        return {
            'response_time_model': {'mae': 0.12, 'mse': 0.08},
            'error_rate_model': {'mae': 0.05, 'mse': 0.03},
            'throughput_model': {'mae': 25.5, 'mse': 18.2}
        }

    # Stubs added so the calls above resolve (the original listing omitted them).
    def _get_anomaly_trends(self):
        """Get anomaly trend data (placeholder stub)"""
        return []

    def _get_critical_alerts(self):
        """Get critical alerts (placeholder stub)"""
        return []

    def _get_warning_alerts(self):
        """Get warning alerts (placeholder stub)"""
        return []

    def _get_info_alerts(self):
        """Get info alerts (placeholder stub)"""
        return []

    def _get_alert_trends(self):
        """Get alert trend data (placeholder stub)"""
        return []

    def _get_component_health(self, component):
        """Get health for a single component (placeholder stub)"""
        return {'status': 'healthy', 'score': 95.0}

    def _get_health_trends(self):
        """Get health trend data (placeholder stub)"""
        return []

    def _get_health_recommendations(self):
        """Get health recommendations (placeholder stub)"""
        return []
```
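The provider is typically surfaced through an API route; a sketch reusing the objects from the earlier sketches (the endpoint path is an assumption):

```python
# Hypothetical endpoint serving the aggregated dashboard payload.
import json
from flask import Response

provider = DashboardDataProvider(metrics, anomaly_engine, forecast_engine)

@app.route('/api/monitoring/dashboard')
def dashboard_data():
    # default=float coerces numpy scalars (e.g. anomaly scores) that the
    # stdlib json encoder cannot serialize natively.
    body = json.dumps(provider.get_dashboard_data(), default=float)
    return Response(body, mimetype='application/json')
```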
Alertmanager routes by severity and fans out to webhook, email, and Slack receivers. (Note: Alertmanager's `email_configs` takes `text`/`html` rather than `body`, so the email bodies below use `text`.)

```yaml
# monitoring/alertmanager.yml
global:
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@aurora-ai.com'
  smtp_auth_username: 'alerts@aurora-ai.com'
  smtp_auth_password: 'password'

route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'
  routes:
    - match:
        severity: critical
      receiver: 'critical-alerts'
    - match:
        severity: warning
      receiver: 'warning-alerts'
    - match:
        severity: info
      receiver: 'info-alerts'

receivers:
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://aurora-web:8080/api/alerts/webhook'
  - name: 'critical-alerts'
    email_configs:
      - to: 'admin@aurora-ai.com'
        headers:
          Subject: '[CRITICAL] Aurora AI Alert'
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          {{ end }}
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
        channel: '#aurora-alerts-critical'
        title: 'Critical Alert: {{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
  - name: 'warning-alerts'
    email_configs:
      - to: 'team@aurora-ai.com'
        headers:
          Subject: '[WARNING] Aurora AI Alert'
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          {{ end }}
  - name: 'info-alerts'
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
        channel: '#aurora-alerts-info'
        title: 'Info: {{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'cluster', 'service']
```
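The `web.hook` receiver posts to `/api/alerts/webhook` on the Aurora web service; a minimal handler for that payload might look like this (field names follow Alertmanager's documented webhook JSON schema; the logging is illustrative):

```python
# Hypothetical receiver for Alertmanager webhook notifications.
from flask import request, jsonify

@app.route('/api/alerts/webhook', methods=['POST'])
def alerts_webhook():
    payload = request.get_json(force=True)
    for alert in payload.get('alerts', []):
        print(alert.get('status'),
              alert.get('labels', {}).get('alertname'),
              alert.get('annotations', {}).get('summary'))
    return jsonify({'received': len(payload.get('alerts', []))})
```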
Finally, the alert rules referenced by `rule_files` in `prometheus.yml`:

```yaml
# monitoring/aurora_rules.yml
groups:
  - name: aurora_alerts
    rules:
      # System Health Alerts
      - alert: SystemHealthLow
        expr: aurora_system_health_score < 70
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "System health score is low"
          description: "System health score is {{ $value }} (threshold < 70)"
      - alert: SystemHealthCritical
        expr: aurora_system_health_score < 50
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "System health score is critical"
          description: "System health score is {{ $value }} (threshold < 50)"

      # Performance Alerts
      - alert: HighResponseTime
        expr: histogram_quantile(0.95, rate(aurora_api_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High API response time"
          description: "95th percentile response time is {{ $value }}s (threshold > 2s)"
      - alert: CriticalResponseTime
        expr: histogram_quantile(0.95, rate(aurora_api_request_duration_seconds_bucket[5m])) > 5
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Critical API response time"
          description: "95th percentile response time is {{ $value }}s (threshold > 5s)"

      # Error Rate Alerts
      - alert: HighErrorRate
        expr: rate(aurora_api_requests_total{status=~"5.."}[5m]) / rate(aurora_api_requests_total[5m]) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate"
          description: "Error rate is {{ $value | humanizePercentage }} (threshold > 5%)"
      - alert: CriticalErrorRate
        expr: rate(aurora_api_requests_total{status=~"5.."}[5m]) / rate(aurora_api_requests_total[5m]) > 0.10
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Critical error rate"
          description: "Error rate is {{ $value | humanizePercentage }} (threshold > 10%)"

      # Resource Alerts
      - alert: HighCPUUsage
        expr: rate(container_cpu_usage_seconds_total[5m]) * 100 > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage"
          description: "CPU usage is {{ $value }}% (threshold > 80%)"
      - alert: HighMemoryUsage
        expr: container_memory_usage_bytes / container_spec_memory_limit_bytes * 100 > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage is {{ $value }}% (threshold > 85%)"

      # Business Metrics Alerts
      - alert: LowPredictionAccuracy
        expr: aurora_model_performance_score < 0.80
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Low model performance"
          description: "Model performance score is {{ $value }} (threshold < 0.80)"
      - alert: NoPredictions
        expr: rate(aurora_predictions_total[5m]) == 0
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "No predictions being made"
          description: "Prediction rate has been 0 over the last 5-minute window"
```