Skip to content

Latest commit

 

History

History
147 lines (110 loc) · 5.26 KB

File metadata and controls

147 lines (110 loc) · 5.26 KB

Observability AI

🎯 Introduction

AI-powered observability applies machine learning to metrics, logs, and traces at scale, surfacing patterns and insights that would be impractical for humans to derive manually.

📚 Key Capabilities

┌─────────────────────────────────────────────────────────────────────┐
│                    AI-Powered Observability                          │
├─────────────────────────────────────────────────────────────────────┤
│                                                                      │
│  Anomaly Detection        Root Cause Analysis       Prediction      │
│  ├── Statistical          ├── Correlation           ├── Capacity    │
│  ├── ML (Isolation        ├── Dependency           ├── Incident     │
│  │   Forest, AutoEncoder) │   graphs               │   forecasting  │
│  └── Deep Learning        └── Causal inference     └── Load         │
│                                                                      │
│  Log Analysis            Alert Optimization        Natural Language │
│  ├── Clustering          ├── Noise reduction       ├── Log search  │
│  ├── Pattern mining      ├── Deduplication         ├── Chat-based  │
│  └── Sentiment           └── Correlation              queries       │
│                                                                      │
└─────────────────────────────────────────────────────────────────────┘

🔍 Anomaly Detection Implementation

Statistical Approach

import numpy as np
from scipy import stats

class StatisticalAnomalyDetector:
    """Rolling z-score anomaly detector for a stream of scalar values.

    Each call to ``detect`` compares the new value against the mean and
    (population) standard deviation of the most recent ``window_size``
    values, the new value included.

    Note: because the value under test is part of its own baseline, the
    absolute z-score can never exceed ``sqrt(window_size - 1)``. Choose
    ``threshold`` below that bound, otherwise nothing is ever flagged.

    Args:
        window_size: Number of most recent values used as the baseline.
        threshold: Absolute z-score above which a value is flagged.
    """

    def __init__(self, window_size=100, threshold=3.0):
        self.window_size = window_size
        self.threshold = threshold
        # Holds at most ``window_size`` recent values once warmed up.
        self.history = []

    def detect(self, value):
        """Record ``value`` and report whether it is anomalous.

        Args:
            value: The newest observation in the stream.

        Returns:
            A ``(is_anomaly, z_score)`` pair. While fewer than
            ``window_size`` values have been seen, returns ``(False, 0)``.
        """
        self.history.append(value)

        # Only the trailing window is ever read, so drop older samples to
        # keep memory bounded on long-running streams.
        excess = len(self.history) - self.window_size
        if self.window_size > 0 and excess > 0:
            del self.history[:excess]

        if len(self.history) < self.window_size:
            return False, 0

        window = self.history[-self.window_size:]
        mean = np.mean(window)
        std = np.std(window)

        # The small epsilon avoids division by zero on a constant window.
        z_score = (value - mean) / (std + 1e-10)

        return abs(z_score) > self.threshold, z_score

ML-Based Detection

from sklearn.ensemble import IsolationForest
import numpy as np

class MLAnomalyDetector:
    """Isolation Forest wrapper that judges one scalar value at a time.

    Args:
        contamination: Passed through to ``IsolationForest`` as the
            expected share of outliers in the training data.
    """

    def __init__(self, contamination=0.01):
        # Fixed seed so repeated training on the same data is reproducible.
        self.model = IsolationForest(contamination=contamination, random_state=42)
        self.is_trained = False

    def train(self, historical_data):
        """Fit the forest on a sequence of historical scalar values."""
        # The estimator expects a 2-D (n_samples, n_features) array, so the
        # flat series becomes a single-feature column.
        column = np.reshape(np.array(historical_data), (-1, 1))
        self.model.fit(column)
        self.is_trained = True

    def detect(self, value):
        """Classify ``value`` with the fitted model.

        Returns:
            ``(is_anomaly, score)``. Before ``train`` has been called this
            is ``(False, 0)``. Afterwards ``is_anomaly`` is true when the
            model predicts ``-1`` and ``score`` is the model's
            ``score_samples`` output for the value.
        """
        if self.is_trained:
            sample = np.array([[value]])
            label = self.model.predict(sample)[0]
            anomaly_score = self.model.score_samples(sample)[0]
            return label == -1, anomaly_score

        return False, 0

📊 Log Analysis with AI

Log Clustering

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN

class LogClusterer:
    """Group similar log lines by clustering their TF-IDF vectors with DBSCAN."""

    def __init__(self):
        self.vectorizer = TfidfVectorizer(max_features=1000)
        self.clusterer = DBSCAN(eps=0.3, min_samples=5)

    def cluster_logs(self, logs):
        """Cluster log lines and return them grouped by cluster label.

        Args:
            logs: Sized sequence of log message strings.

        Returns:
            Dict mapping each integer cluster label to the list of log
            lines assigned to it, in input order. DBSCAN reports lines it
            could not place in any cluster under the label ``-1``. An
            empty ``logs`` yields an empty dict.
        """
        # The vectorizer cannot fit on zero documents; nothing to cluster.
        if len(logs) == 0:
            return {}

        # Vectorize logs (refits the vocabulary on every call).
        X = self.vectorizer.fit_transform(logs)

        # DBSCAN accepts the sparse TF-IDF matrix directly, so there is no
        # need to materialize a dense (n_logs x n_features) array.
        labels = self.clusterer.fit_predict(X)

        # Group logs by cluster, using plain ints as keys rather than
        # NumPy scalars so the result is easy to serialize.
        clustered = {}
        for label, line in zip(labels, logs):
            clustered.setdefault(int(label), []).append(line)

        return clustered

🎯 Root Cause Analysis

# Simplified RCA using correlation
def find_related_metrics(target_metric, all_metrics, threshold=0.7):
    """Rank other metrics by how strongly they correlate with the target.

    Args:
        target_metric: Dict with a ``'name'`` and a ``'values'`` series.
        all_metrics: Dict mapping metric name to its series of values.
        threshold: Minimum absolute correlation (exclusive) to be kept.

    Returns:
        List of ``(name, correlation)`` tuples, strongest absolute
        correlation first. The entry whose name equals the target's is
        never included.
    """
    related = []

    for metric_name, series in all_metrics.items():
        if metric_name != target_metric['name']:
            # Off-diagonal entry of the 2x2 correlation matrix.
            coefficient = np.corrcoef(target_metric['values'], series)[0, 1]
            if abs(coefficient) > threshold:
                related.append((metric_name, coefficient))

    related.sort(key=lambda pair: abs(pair[1]), reverse=True)
    return related

✅ Best Practices

  1. Start with baselines: Understand normal before detecting anomalies
  2. Combine approaches: Statistical + ML for robustness
  3. Human in the loop: AI suggests, humans decide
  4. Continuous learning: Retrain on new data
  5. Explainability: Provide reasoning for detections

Next: Learn about ML in Observability.