Skip to content

Latest commit

 

History

History
843 lines (718 loc) · 18.8 KB

File metadata and controls

843 lines (718 loc) · 18.8 KB

Aurora AI Framework - Deployment and DevOps Guide

🌟 Overview

This guide covers deploying, managing, and maintaining the Aurora AI framework in production environments. The framework comprises 27 integrated systems and 74 API endpoints; the sections below provide enterprise-grade deployment strategies and DevOps best practices for running them.

🚀 Deployment Strategies

Deployment Architecture

# deployment/architecture.yaml
# Declarative summary of the production topology: load balancer, web tier,
# database, cache and object storage.
production_architecture:
  load_balancer:
    type: 'nginx'
    ssl_termination: true
    # NOTE(review): unit is not stated in this file; presumably seconds.
    health_check_interval: 30

  web_servers:
    count: 3
    instance_type: 't3.medium'
    auto_scaling: true
    min_instances: 2
    max_instances: 10

  database:
    type: 'postgresql'
    instance_type: 'db.r6g.large'
    multi_az: true
    # NOTE(review): unit is not stated in this file; presumably days.
    backup_retention: 30
    read_replicas: 2

  cache:
    type: 'redis'
    instance_type: 'cache.r6g.large'
    cluster_mode: true
    num_cache_clusters: 3

  storage:
    type: 's3'
    backup_bucket: 'aurora-ai-backups'
    data_bucket: 'aurora-ai-data'
    encryption: true

Container Deployment

# Dockerfile
# Builds the Aurora AI web backend image and runs it as a non-root user.
FROM python:3.9-slim

WORKDIR /app

# Install system dependencies.
# curl is needed by the HEALTHCHECK below; the slim base image does not ship
# it, so without this package every probe fails and the container is always
# reported as unhealthy.
RUN apt-get update && apt-get install -y \
    curl \
    gcc \
    g++ \
    libpq-dev \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so the dependency layer is cached independently of
# application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create non-root user and drop privileges for everything that follows
RUN useradd --create-home --shell /bin/bash aurora
USER aurora

# Expose port
EXPOSE 8080

# Health check: probe the HTTP health endpoint served on the exposed port
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
  CMD curl -f http://localhost:8080/api/health || exit 1

# Start application
CMD ["python", "web_backend/server.py"]

Docker Compose Configuration

# docker-compose.yml
# Local/production-like stack: web backend, PostgreSQL, Redis and an nginx
# front end.
#
# NOTE: the long-form `depends_on` with `condition:` requires a Compose
# implementation that follows the Compose Specification (Docker Compose V2,
# or docker-compose >= 1.27).
version: '3.8'

services:
  aurora-web:
    build: .
    ports:
      - "8080:8080"
    environment:
      - AURORA_ENV=production
      - DB_HOST=aurora-db
      - REDIS_HOST=aurora-redis
    # Wait until the backing services pass their healthchecks; the short
    # list form only orders container start-up and ignores health.
    depends_on:
      aurora-db:
        condition: service_healthy
      aurora-redis:
        condition: service_healthy
    volumes:
      - ./logs:/app/logs
      - ./data:/app/data
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/api/health"]
      interval: 30s
      timeout: 10s
      retries: 3

  aurora-db:
    image: postgres:13
    environment:
      - POSTGRES_DB=aurora_ai
      - POSTGRES_USER=aurora_user
      # Fail fast with a clear message instead of silently expanding an
      # unset variable to an empty password.
      - POSTGRES_PASSWORD=${DB_PASSWORD:?DB_PASSWORD must be set}
    volumes:
      - postgres_data:/var/lib/postgresql/data
      - ./scripts/init.sql:/docker-entrypoint-initdb.d/init.sql:ro
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U aurora_user -d aurora_ai"]
      interval: 10s
      timeout: 5s
      retries: 5

  aurora-redis:
    image: redis:6-alpine
    # Append-only file persistence, stored on the redis_data volume
    command: redis-server --appendonly yes
    volumes:
      - redis_data:/data
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 5s
      retries: 3

  nginx:
    image: nginx:alpine
    ports:
      - "80:80"
      - "443:443"
    # Configuration and certificates are mounted read-only
    volumes:
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
      - ./nginx/ssl:/etc/nginx/ssl:ro
    depends_on:
      aurora-web:
        condition: service_healthy
    restart: unless-stopped

volumes:
  postgres_data:
  redis_data:

🔧 Kubernetes Deployment

Kubernetes Manifests

# k8s/namespace.yaml
# Dedicated namespace for the Aurora AI resources; the other manifests in
# this section set `namespace: aurora-ai` in their metadata.
apiVersion: v1
kind: Namespace
metadata:
  name: aurora-ai
  labels:
    name: aurora-ai

---
# k8s/configmap.yaml
# Non-secret runtime settings. Each setting is its own key so it can be
# injected as an individual environment variable. (A single multi-line
# `aurora.env` key would be injected verbatim as ONE variable value.)
apiVersion: v1
kind: ConfigMap
metadata:
  name: aurora-config
  namespace: aurora-ai
data:
  AURORA_ENV: production
  LOG_LEVEL: INFO
  # ConfigMap values must be strings, so the number is quoted
  WORKERS: "4"

---
# k8s/secret.yaml
# Placeholders only: replace with base64-encoded values and do not commit
# real secrets to version control.
apiVersion: v1
kind: Secret
metadata:
  name: aurora-secrets
  namespace: aurora-ai
type: Opaque
data:
  db-password: <base64-encoded-password>
  jwt-secret: <base64-encoded-secret>

---
# k8s/deployment.yaml
# Runs the aurora-web container with three replicas.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: aurora-web
  namespace: aurora-ai
  labels:
    app: aurora-web
spec:
  replicas: 3
  selector:
    matchLabels:
      app: aurora-web
  template:
    metadata:
      labels:
        app: aurora-web
    spec:
      containers:
      - name: aurora-web
        image: aurora-ai:latest
        ports:
        - containerPort: 8080
        # Import every key of aurora-config (AURORA_ENV, LOG_LEVEL, WORKERS)
        # as an environment variable of the same name.
        envFrom:
        - configMapRef:
            name: aurora-config
        env:
        - name: DB_PASSWORD
          valueFrom:
            secretKeyRef:
              name: aurora-secrets
              key: db-password
        resources:
          requests:
            memory: "512Mi"
            cpu: "250m"
          limits:
            memory: "1Gi"
            cpu: "500m"
        # Liveness: probe /api/health on the container port
        livenessProbe:
          httpGet:
            path: /api/health
            port: 8080
          initialDelaySeconds: 30
          periodSeconds: 10
        # Readiness: probe /api/status on the container port
        readinessProbe:
          httpGet:
            path: /api/status
            port: 8080
          initialDelaySeconds: 5
          periodSeconds: 5

---
# k8s/service.yaml
# Exposes the pods labelled app=aurora-web: service port 80 forwards to
# container port 8080.
apiVersion: v1
kind: Service
metadata:
  name: aurora-web-service
  namespace: aurora-ai
spec:
  selector:
    app: aurora-web
  ports:
  - protocol: TCP
    port: 80
    targetPort: 8080
  # NOTE(review): an Ingress in this guide also routes to this service; if
  # all external traffic enters through the Ingress, ClusterIP may be
  # sufficient here. Confirm before changing.
  type: LoadBalancer

---
# k8s/ingress.yaml
# Routes HTTPS traffic for aurora-ai.example.com to aurora-web-service.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: aurora-ingress
  namespace: aurora-ai
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
spec:
  # Selects the controller via the networking.k8s.io/v1 field; this replaces
  # the deprecated `kubernetes.io/ingress.class` annotation. Requires an
  # IngressClass named "nginx" to exist in the cluster.
  ingressClassName: nginx
  tls:
  - hosts:
    - aurora-ai.example.com
    # Secret that holds the TLS certificate for the host above
    secretName: aurora-tls
  rules:
  - host: aurora-ai.example.com
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: aurora-web-service
            port:
              number: 80

Horizontal Pod Autoscaler

# k8s/hpa.yaml
# Autoscaler for the aurora-web Deployment: keeps between 2 and 10 replicas
# based on average CPU and memory utilization.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: aurora-web-hpa
  namespace: aurora-ai
spec:
  # Workload whose replica count this autoscaler controls
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: aurora-web
  minReplicas: 2
  maxReplicas: 10
  metrics:
  # Target: 70% average CPU utilization
  - type: Resource
    resource:
      name: cpu
      target:
        type: Utilization
        averageUtilization: 70
  # Target: 80% average memory utilization
  - type: Resource
    resource:
      name: memory
      target:
        type: Utilization
        averageUtilization: 80

🔒 CI/CD Pipeline

GitHub Actions Workflow

# .github/workflows/deploy.yml
# CI/CD workflow: test, build and push the container image, then deploy.
name: Deploy Aurora AI

# Runs on pushes to main/develop and on pull requests that target main.
on:
  push:
    branches: [main, develop]
  pull_request:
    branches: [main]

# Workflow-wide values used to build the image reference
# (REGISTRY/IMAGE_NAME) in the jobs below.
env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

jobs:
  # Lints and tests the code on every supported Python version, with
  # PostgreSQL and Redis service containers for the integration tests.
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Versions must be quoted: an unquoted 3.10 is parsed by YAML as the
        # float 3.1, which would select Python 3.1 instead of 3.10.
        python-version: ["3.8", "3.9", "3.10"]

    services:
      postgres:
        image: postgres:13
        env:
          POSTGRES_PASSWORD: postgres
          POSTGRES_DB: aurora_test
        options: >-
          --health-cmd pg_isready
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
        ports:
          - 5432:5432

      redis:
        image: redis:6
        options: >-
          --health-cmd "redis-cli ping"
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
        ports:
          - 6379:6379

    steps:
    - name: Checkout code
      uses: actions/checkout@v3

    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}

    - name: Cache dependencies
      uses: actions/cache@v3
      with:
        path: ~/.cache/pip
        key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements*.txt') }}
        restore-keys: |
          ${{ runner.os }}-pip-

    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements.txt
        pip install pytest pytest-cov flake8

    # Only syntax errors and undefined names fail the build
    - name: Lint with flake8
      run: |
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics

    - name: Run unit tests
      run: |
        pytest tests/unit/ --cov=aurora --cov-report=xml

    # --cov is required here as well: --cov-append alone does not enable
    # coverage measurement, so integration coverage would not be recorded.
    - name: Run integration tests
      run: |
        pytest tests/integration/ --cov=aurora --cov-append --cov-report=xml
      env:
        DB_HOST: localhost
        DB_PORT: 5432
        DB_NAME: aurora_test
        DB_USER: postgres
        DB_PASSWORD: postgres
        REDIS_HOST: localhost
        REDIS_PORT: 6379

    - name: Upload coverage to Codecov
      uses: codecov/codecov-action@v3
      with:
        file: ./coverage.xml

  # Builds the container image and pushes it to the registry. Runs only for
  # push events, after the test job has succeeded.
  build-and-push:
    needs: test
    runs-on: ubuntu-latest
    if: github.event_name == 'push'
    # GITHUB_TOKEN needs packages:write to push to ghcr.io; declare it
    # explicitly rather than relying on the repository default.
    permissions:
      contents: read
      packages: write

    steps:
    - name: Checkout code
      uses: actions/checkout@v3

    - name: Log in to Container Registry
      uses: docker/login-action@v2
      with:
        registry: ${{ env.REGISTRY }}
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}

    # The deploy jobs reference the image as <registry>/<name>:<commit sha>.
    # The action's default tag set has no commit-SHA tag, so publish one
    # explicitly alongside the branch tag.
    - name: Extract metadata
      id: meta
      uses: docker/metadata-action@v4
      with:
        images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
        tags: |
          type=ref,event=branch
          type=raw,value=${{ github.sha }}

    - name: Build and push Docker image
      uses: docker/build-push-action@v4
      with:
        context: .
        push: true
        tags: ${{ steps.meta.outputs.tags }}
        labels: ${{ steps.meta.outputs.labels }}
        # Reuse build layers between runs via the GitHub Actions cache
        cache-from: type=gha
        cache-to: type=gha,mode=max

  # Deploys the freshly built image to the staging cluster on pushes to the
  # develop branch.
  deploy-staging:
    needs: build-and-push
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/develop'
    environment: staging

    steps:
    - name: Checkout code
      uses: actions/checkout@v3

    - name: Configure kubectl
      uses: azure/k8s-set-context@v1
      with:
        method: kubeconfig
        kubeconfig: ${{ secrets.KUBE_CONFIG_STAGING }}

    # The manifests in k8s/ place aurora-web in the aurora-ai namespace, so
    # the imperative commands must target that namespace explicitly instead
    # of whatever namespace the kubeconfig context defaults to.
    # NOTE(review): assumes an image tagged with the full commit SHA has been
    # pushed and that IMAGE_NAME is lowercase (image references must be).
    - name: Deploy to staging
      run: |
        kubectl apply -f k8s/
        kubectl -n aurora-ai set image deployment/aurora-web aurora-web=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.sha }}
        kubectl -n aurora-ai rollout status deployment/aurora-web --timeout=300s

  # Deploys the freshly built image to the production cluster on pushes to
  # the main branch.
  deploy-production:
    needs: build-and-push
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/main'
    environment: production

    steps:
    - name: Checkout code
      uses: actions/checkout@v3

    - name: Configure kubectl
      uses: azure/k8s-set-context@v1
      with:
        method: kubeconfig
        kubeconfig: ${{ secrets.KUBE_CONFIG_PROD }}

    # The manifests in k8s/ place aurora-web in the aurora-ai namespace, so
    # the imperative commands must target that namespace explicitly instead
    # of whatever namespace the kubeconfig context defaults to.
    # NOTE(review): assumes an image tagged with the full commit SHA has been
    # pushed and that IMAGE_NAME is lowercase (image references must be).
    - name: Deploy to production
      run: |
        kubectl apply -f k8s/
        kubectl -n aurora-ai set image deployment/aurora-web aurora-web=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.sha }}
        kubectl -n aurora-ai rollout status deployment/aurora-web --timeout=300s

📊 Monitoring and Observability

Prometheus Configuration

# monitoring/prometheus.yml
# Scrape and alerting configuration for the Aurora AI stack.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - 'aurora_rules.yml'

scrape_configs:
  # Web backend: custom metrics path, scraped every 30s instead of the
  # global 15s.
  - job_name: "aurora-web"
    metrics_path: "/api/monitoring/metrics"
    scrape_interval: 30s
    static_configs:
      - targets:
          - "aurora-web:8080"

  - job_name: "postgres"
    static_configs:
      - targets:
          - "postgres-exporter:9187"

  - job_name: "redis"
    static_configs:
      - targets:
          - "redis-exporter:9121"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "alertmanager:9093"

Grafana Dashboard

{
  "dashboard": {
    "title": "Aurora AI System Dashboard",
    "panels": [
      {
        "title": "API Response Time",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
            "legendFormat": "95th percentile"
          },
          {
            "expr": "histogram_quantile(0.50, rate(http_request_duration_seconds_bucket[5m]))",
            "legendFormat": "50th percentile"
          }
        ]
      },
      {
        "title": "Request Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(http_requests_total[5m])",
            "legendFormat": "Requests/sec"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m]))",
            "legendFormat": "Error Rate"
          }
        ]
      },
      {
        "title": "System Resources",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(container_cpu_usage_seconds_total[5m])",
            "legendFormat": "CPU Usage"
          },
          {
            "expr": "container_memory_usage_bytes / 1024 / 1024",
            "legendFormat": "Memory Usage (MB)"
          }
        ]
      }
    ]
  }
}

🔧 Infrastructure as Code

Terraform Configuration

# main.tf
# AWS provider; the region comes from var.aws_region, which is declared
# outside this snippet.
provider "aws" {
  region = var.aws_region
}

# VPC Configuration
# A /16 network with DNS hostnames and DNS support enabled.
resource "aws_vpc" "aurora_vpc" {
  cidr_block           = "10.0.0.0/16"
  enable_dns_hostnames = true
  enable_dns_support   = true

  tags = {
    Name = "aurora-ai-vpc"
  }
}

# Subnet Configuration
# Two public and two private /24 subnets inside the 10.0.0.0/16 VPC, one per
# availability zone index. The third octet is derived from count.index:
#   public  -> 10.0.1.0/24, 10.0.2.0/24
#   private -> 10.0.3.0/24, 10.0.4.0/24
# (The previous "10.0.N.0.0/24" form had five octets and is not a valid CIDR.)
# data.aws_availability_zones.available is declared outside this snippet.
resource "aws_subnet" "public" {
  count             = 2
  vpc_id            = aws_vpc.aurora_vpc.id
  cidr_block        = "10.0.${count.index + 1}.0/24"
  availability_zone = data.aws_availability_zones.available.names[count.index]

  tags = {
    Name = "aurora-ai-public-${count.index + 1}"
  }
}

resource "aws_subnet" "private" {
  count             = 2
  vpc_id            = aws_vpc.aurora_vpc.id
  cidr_block        = "10.0.${count.index + 3}.0/24"
  availability_zone = data.aws_availability_zones.available.names[count.index]

  tags = {
    Name = "aurora-ai-private-${count.index + 1}"
  }
}

# EKS Cluster
# Control plane attached to all public and private subnets defined above.
# The IAM role and policy attachment referenced here are declared outside
# this snippet.
resource "aws_eks_cluster" "aurora_cluster" {
  name     = "aurora-ai-cluster"
  role_arn = aws_iam_role.eks_cluster.arn
  vpc_config {
    subnet_ids = concat(aws_subnet.public[*].id, aws_subnet.private[*].id)
  }

  # Explicit ordering on the cluster policy attachment
  depends_on = [
    aws_iam_role_policy_attachment.eks_cluster_policy,
  ]
}

# RDS Database
# PostgreSQL instance for the application. The security group, DB subnet
# group and var.db_password are declared outside this snippet.
resource "aws_db_instance" "aurora_db" {
  identifier = "aurora-ai-db"

  engine         = "postgres"
  engine_version = "13.7"
  instance_class = "db.r6g.large"

  # Matches the production architecture description (database.multi_az: true)
  multi_az = true

  # Storage starts at 100 GiB and may autoscale up to 1000 GiB
  allocated_storage     = 100
  max_allocated_storage = 1000
  storage_type          = "gp2"
  storage_encrypted     = true

  db_name  = "aurora_ai"
  username = "aurora_user"
  password = var.db_password

  vpc_security_group_ids = [aws_security_group.db.id]
  db_subnet_group_name   = aws_db_subnet_group.aurora_db.name

  backup_retention_period = 30
  backup_window           = "03:00-04:00"
  maintenance_window      = "sun:04:00-sun:05:00"

  # Take a final snapshot when the instance is destroyed so that removing
  # the resource cannot silently discard the production data.
  skip_final_snapshot       = false
  final_snapshot_identifier = "aurora-ai-db-final"

  tags = {
    Name = "aurora-ai-database"
  }
}

# ElastiCache Redis
# Subnet group that places the cache in the private subnets.
resource "aws_elasticache_subnet_group" "aurora_redis_subnet" {
  name       = "aurora-redis-subnet"
  subnet_ids = aws_subnet.private[*].id
}

# Single-node Redis cache. aws_security_group.redis is declared outside this
# snippet.
resource "aws_elasticache_cluster" "aurora_redis" {
  cluster_id = "aurora-redis"
  engine     = "redis"
  node_type  = "cache.r6g.large"
  # For the Redis engine this resource only accepts exactly one node; any
  # other value is rejected. A multi-node / replicated Redis deployment must
  # be modelled with aws_elasticache_replication_group instead.
  num_cache_nodes      = 1
  parameter_group_name = "default.redis6.x"
  port                 = 6379
  subnet_group_name    = aws_elasticache_subnet_group.aurora_redis_subnet.name
  security_group_ids   = [aws_security_group.redis.id]

  tags = {
    Name = "aurora-ai-redis"
  }
}

🔄 Deployment Automation

Deployment Scripts

#!/bin/bash
# scripts/deploy.sh
#
# Rolls the aurora-web Deployment to a new image tag and verifies the result.
#
# Usage: deploy.sh [ENVIRONMENT] [VERSION] [REGION]
#   ENVIRONMENT  kubectl context to deploy to   (default: staging)
#   VERSION      image tag to deploy            (default: latest)
#   REGION       informational, only printed    (default: us-east-1)
# Environment:
#   NAMESPACE    namespace of the Deployment    (default: aurora-ai, the
#                namespace used by the manifests in k8s/)

# Abort on errors, on unset variables and on failures inside pipelines
set -euo pipefail

ENVIRONMENT=${1:-staging}
VERSION=${2:-latest}
REGION=${3:-us-east-1}
NAMESPACE=${NAMESPACE:-aurora-ai}

echo "Deploying Aurora AI to $ENVIRONMENT environment"
echo "Version: $VERSION"
echo "Region: $REGION"

# Check prerequisites
command -v kubectl >/dev/null 2>&1 || { echo "kubectl is required but not installed." >&2; exit 1; }
command -v helm >/dev/null 2>&1 || { echo "helm is required but not installed." >&2; exit 1; }

# Set kubectl context (quoted so unusual context names are passed intact)
kubectl config use-context "$ENVIRONMENT"

# Update deployment
echo "Updating deployment..."
kubectl -n "$NAMESPACE" set image deployment/aurora-web "aurora-web=aurora-ai:$VERSION"

# Wait for rollout; a timeout makes this command fail and, via set -e,
# aborts the script
echo "Waiting for rollout to complete..."
kubectl -n "$NAMESPACE" rollout status deployment/aurora-web --timeout=300s

# Verify deployment
echo "Verifying deployment..."
kubectl -n "$NAMESPACE" get pods -l app=aurora-web
kubectl -n "$NAMESPACE" get services

# Run health check from inside a pod of the deployment
echo "Running health check..."
kubectl -n "$NAMESPACE" exec deployment/aurora-web -- curl -f http://localhost:8080/api/health

echo "Deployment completed successfully!"

Rollback Script

#!/bin/bash
# scripts/rollback.sh
#
# Rolls the aurora-web Deployment back to an earlier revision.
#
# Usage: rollback.sh [ENVIRONMENT] [REVISION]
#   ENVIRONMENT  kubectl context to operate on  (default: staging)
#   REVISION     numeric rollout revision, or "previous" for the revision
#                before the current one         (default: previous)
# Environment:
#   NAMESPACE    namespace of the Deployment    (default: aurora-ai, the
#                namespace used by the manifests in k8s/)

# Abort on errors, on unset variables and on failures inside pipelines
set -euo pipefail

ENVIRONMENT=${1:-staging}
REVISION=${2:-previous}
NAMESPACE=${NAMESPACE:-aurora-ai}

# --to-revision only accepts an integer, so anything other than the
# "previous" keyword must be numeric.
if [[ "$REVISION" != "previous" && ! "$REVISION" =~ ^[0-9]+$ ]]; then
  echo "REVISION must be a revision number or 'previous' (got: $REVISION)" >&2
  exit 1
fi

echo "Rolling back Aurora AI in $ENVIRONMENT environment to revision $REVISION"

# Set kubectl context
kubectl config use-context "$ENVIRONMENT"

# Rollback deployment
echo "Rolling back deployment..."
if [[ "$REVISION" == "previous" ]]; then
  # Without --to-revision, kubectl rolls back to the previous revision.
  # Passing the literal string "previous" would be rejected.
  kubectl -n "$NAMESPACE" rollout undo deployment/aurora-web
else
  kubectl -n "$NAMESPACE" rollout undo deployment/aurora-web --to-revision="$REVISION"
fi

# Wait for rollback
echo "Waiting for rollback to complete..."
kubectl -n "$NAMESPACE" rollout status deployment/aurora-web --timeout=300s

# Verify rollback
echo "Verifying rollback..."
kubectl -n "$NAMESPACE" get pods -l app=aurora-web
kubectl -n "$NAMESPACE" get services

# Run health check from inside a pod of the deployment
echo "Running health check..."
kubectl -n "$NAMESPACE" exec deployment/aurora-web -- curl -f http://localhost:8080/api/health

echo "Rollback completed successfully!"

📋 Deployment Checklist

Pre-Deployment Checklist

  • Environment variables configured
  • Database connections tested
  • Cache connections tested
  • SSL certificates valid
  • Load balancer configured
  • Monitoring systems active
  • Backup procedures verified
  • Security scans completed
  • Performance tests passed
  • Documentation updated

Post-Deployment Verification

  • All pods running healthy
  • Services accessible
  • Database connections working
  • Cache connections working
  • API endpoints responding
  • Monitoring metrics collecting
  • Alert rules active
  • Backup jobs scheduled
  • Log aggregation working

Production Deployment Checklist

  • Staging deployment successful
  • All tests passed in staging
  • Performance benchmarks met
  • Security audit completed
  • Disaster recovery tested
  • Monitoring dashboards configured
  • Alert notifications tested
  • Documentation updated
  • Team notified
  • Rollback plan ready

🚨 Emergency Procedures

Deployment Failure Recovery

#!/bin/bash
# scripts/emergency-rollback.sh
#
# Immediately rolls aurora-web back to the previous revision, verifies
# health and notifies the team of the outcome.
#
# Environment:
#   NAMESPACE          namespace of the Deployment (default: aurora-ai, the
#                      namespace used by the manifests in k8s/)
#   SLACK_WEBHOOK_URL  Slack incoming-webhook URL
#                      (default: the placeholder URL below)

# Abort on the first failing step so that a failed rollback is never
# reported as completed.
set -euo pipefail

NAMESPACE=${NAMESPACE:-aurora-ai}
SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL:-https://hooks.slack.com/your-webhook}

# Post a one-line message to Slack; a notification failure is only warned
# about and never changes the outcome of the rollback itself.
notify() {
  curl -X POST "$SLACK_WEBHOOK_URL" \
    -H 'Content-type: application/json' \
    -d "{\"text\":\"$1\"}" \
    || echo "Warning: failed to send Slack notification." >&2
}

# Any failing step below triggers a failure notification before the script
# exits.
trap 'notify "Emergency rollback FAILED for Aurora AI"' ERR

echo "Emergency rollback initiated..."

# Get current deployment status. --watch=false prints the status once
# instead of blocking on a stuck rollout, and the result is informational
# only, so a non-zero exit must not stop the rollback.
kubectl -n "$NAMESPACE" rollout status deployment/aurora-web --watch=false || true

# Rollback to previous stable version
kubectl -n "$NAMESPACE" rollout undo deployment/aurora-web

# Wait for rollback completion
kubectl -n "$NAMESPACE" rollout status deployment/aurora-web --timeout=300s

# Verify system health from inside a pod of the deployment
kubectl -n "$NAMESPACE" exec deployment/aurora-web -- curl -f http://localhost:8080/api/health

# Notify team
notify "Emergency rollback completed for Aurora AI"

echo "Emergency rollback completed!"

Aurora AI Deployment and DevOps Guide
Enterprise Deployment • CI/CD • Kubernetes • Infrastructure as Code