This comprehensive deployment and DevOps guide covers all aspects of deploying, managing, and maintaining the Aurora AI framework in production environments. With 27 integrated systems and 74 API endpoints, this guide provides enterprise-grade deployment strategies and DevOps best practices.
# deployment/architecture.yaml
# Declarative summary of the production topology. Every component is its own
# nested mapping so that keys repeated across components (`type`,
# `instance_type`) do not collide as duplicates at a single level.
production_architecture:
  load_balancer:
    type: "nginx"
    ssl_termination: true
    health_check_interval: 30
  web_servers:
    count: 3
    instance_type: "t3.medium"
    auto_scaling: true
    min_instances: 2
    max_instances: 10
  database:
    type: "postgresql"
    instance_type: "db.r6g.large"
    multi_az: true
    backup_retention: 30
    read_replicas: 2
  cache:
    type: "redis"
    instance_type: "cache.r6g.large"
    cluster_mode: true
    num_cache_clusters: 3
  storage:
    type: "s3"
    backup_bucket: "aurora-ai-backups"
    data_bucket: "aurora-ai-data"
    encryption: true
# Dockerfile
FROM python:3.9-slim

WORKDIR /app

# Install system dependencies.
# gcc, g++ and libpq-dev are the build packages the image already relied on;
# curl is added because the HEALTHCHECK below invokes it and the slim base
# image does not provide it, which would mark every container unhealthy.
RUN apt-get update && apt-get install -y \
        curl \
        gcc \
        g++ \
        libpq-dev \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so the dependency layer is rebuilt only when
# requirements.txt changes, not on every source edit.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Create the non-root user before copying the code so the copy can be owned by it.
RUN useradd --create-home --shell /bin/bash aurora

# Copy application code, owned by the runtime user so it can write under /app.
COPY --chown=aurora:aurora . .

# Drop privileges for everything that runs in the container.
USER aurora

# Expose port
EXPOSE 8080

# Health check: probes the HTTP health endpoint from inside the container.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8080/api/health || exit 1

# Start application
CMD ["python", "web_backend/server.py"]
# docker-compose.yml
version: '3.8'

services:
  # Application container built from the Dockerfile in this directory.
  aurora-web:
    build: .
    ports:
      - "8080:8080"
    environment:
      - AURORA_ENV=production
      - DB_HOST=aurora-db
      - REDIS_HOST=aurora-redis
      # NOTE(review): the Kubernetes Deployment and the CI test job both pass
      # DB_PASSWORD to the application, so it is passed here too; confirm the
      # app reads this variable name. `:?` aborts `up` when it is unset.
      - DB_PASSWORD=${DB_PASSWORD:?DB_PASSWORD must be set}
    depends_on:
      - aurora-db
      - aurora-redis
    volumes:
      - ./logs:/app/logs
      - ./data:/app/data
    restart: unless-stopped
    healthcheck:
      # Requires curl inside the application image.
      test: ["CMD", "curl", "-f", "http://localhost:8080/api/health"]
      interval: 30s
      timeout: 10s
      retries: 3

  aurora-db:
    image: postgres:13
    environment:
      - POSTGRES_DB=aurora_ai
      - POSTGRES_USER=aurora_user
      # Fail fast instead of starting PostgreSQL with an empty password.
      - POSTGRES_PASSWORD=${DB_PASSWORD:?DB_PASSWORD must be set}
    volumes:
      - postgres_data:/var/lib/postgresql/data
      - ./scripts/init.sql:/docker-entrypoint-initdb.d/init.sql
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U aurora_user -d aurora_ai"]
      interval: 10s
      timeout: 5s
      retries: 5

  aurora-redis:
    image: redis:6-alpine
    # Append-only file persistence, stored on the redis_data volume.
    command: redis-server --appendonly yes
    volumes:
      - redis_data:/data
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 5s
      retries: 3

  nginx:
    image: nginx:alpine
    ports:
      - "80:80"
      - "443:443"
    volumes:
      # Configuration and certificates are mounted read-only.
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
      - ./nginx/ssl:/etc/nginx/ssl:ro
    depends_on:
      - aurora-web
    restart: unless-stopped

volumes:
  postgres_data:
  redis_data:
# k8s/namespace.yaml
# Namespace that the other Aurora AI manifests in this guide are created in.
apiVersion: v1
kind: Namespace
metadata:
  name: aurora-ai
  labels:
    name: aurora-ai
---
# k8s/configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: aurora-config
  namespace: aurora-ai
data:
  # One key per setting, so each can be injected as a single environment
  # variable. ConfigMap values must be strings, hence the quoted number.
  AURORA_ENV: "production"
  LOG_LEVEL: "INFO"
  WORKERS: "4"
  # Env-file form of the same settings, kept for consumers that read this key.
  aurora.env: |
    AURORA_ENV=production
    LOG_LEVEL=INFO
    WORKERS=4
---
# k8s/secret.yaml
# The values below are placeholders; replace them with base64-encoded data
# and keep the real values out of version control.
apiVersion: v1
kind: Secret
metadata:
  name: aurora-secrets
  namespace: aurora-ai
type: Opaque
data:
  db-password: <base64-encoded-password>
  jwt-secret: <base64-encoded-secret>
---
# k8s/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: aurora-web
  namespace: aurora-ai
  labels:
    app: aurora-web
spec:
  replicas: 3
  selector:
    matchLabels:
      app: aurora-web
  template:
    metadata:
      labels:
        app: aurora-web
    spec:
      containers:
        - name: aurora-web
          image: aurora-ai:latest
          ports:
            - containerPort: 8080
          env:
            # Reads the single-value key. Referencing `aurora.env` here would
            # put the whole multi-line env file into AURORA_ENV.
            - name: AURORA_ENV
              valueFrom:
                configMapKeyRef:
                  name: aurora-config
                  key: AURORA_ENV
            - name: DB_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: aurora-secrets
                  key: db-password
          resources:
            requests:
              memory: "512Mi"
              cpu: "250m"
            limits:
              memory: "1Gi"
              cpu: "500m"
          # Liveness and readiness use different endpoints on the same port.
          livenessProbe:
            httpGet:
              path: /api/health
              port: 8080
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /api/status
              port: 8080
            initialDelaySeconds: 5
            periodSeconds: 5
---
# k8s/service.yaml
# Exposes the pods labelled app=aurora-web: service port 80 forwards to
# container port 8080.
apiVersion: v1
kind: Service
metadata:
  name: aurora-web-service
  namespace: aurora-ai
spec:
  type: LoadBalancer
  selector:
    app: aurora-web
  ports:
    - protocol: TCP
      port: 80
      targetPort: 8080
---
# k8s/ingress.yaml
# Routes HTTPS traffic for aurora-ai.example.com to aurora-web-service.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: aurora-ingress
  namespace: aurora-ai
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
spec:
  # Replaces the deprecated `kubernetes.io/ingress.class` annotation; the two
  # must not be set together. Assumes an IngressClass named "nginx" exists.
  ingressClassName: nginx
  tls:
    - hosts:
        - aurora-ai.example.com
      secretName: aurora-tls
  rules:
    - host: aurora-ai.example.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: aurora-web-service
                port:
                  number: 80
---
# k8s/hpa.yaml
# Scales the aurora-web Deployment between 2 and 10 replicas on average CPU
# (70%) and memory (80%) utilization.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: aurora-web-hpa
  namespace: aurora-ai
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: aurora-web
  minReplicas: 2
  maxReplicas: 10
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
# .github/workflows/deploy.yml
name: Deploy Aurora AI

# Runs tests on pushes to main/develop and on pull requests to main; pushes
# additionally build the image and deploy (develop -> staging, main -> production).
on:
  push:
    branches: [main, develop]
  pull_request:
    branches: [main]

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Quoted on purpose: an unquoted 3.10 is parsed as the number 3.1.
        python-version: ["3.8", "3.9", "3.10"]
    services:
      postgres:
        image: postgres:13
        env:
          POSTGRES_PASSWORD: postgres
          POSTGRES_DB: aurora_test
        options: >-
          --health-cmd pg_isready
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
        ports:
          - "5432:5432"
      redis:
        image: redis:6
        options: >-
          --health-cmd "redis-cli ping"
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
        ports:
          - "6379:6379"
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Cache dependencies
        uses: actions/cache@v3
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements*.txt') }}
          restore-keys: |
            ${{ runner.os }}-pip-
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install pytest pytest-cov flake8
      - name: Lint with flake8
        run: |
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
      - name: Run unit tests
        run: |
          pytest tests/unit/ --cov=aurora --cov-report=xml
      - name: Run integration tests
        # --cov is repeated here: --cov-append only has an effect when
        # coverage measurement is enabled for this run as well.
        run: |
          pytest tests/integration/ --cov=aurora --cov-append --cov-report=xml
        env:
          DB_HOST: localhost
          DB_PORT: 5432
          DB_NAME: aurora_test
          DB_USER: postgres
          DB_PASSWORD: postgres
          REDIS_HOST: localhost
          REDIS_PORT: 6379
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
        with:
          file: ./coverage.xml

  build-and-push:
    needs: test
    runs-on: ubuntu-latest
    if: github.event_name == 'push'
    # GITHUB_TOKEN needs package write access to push to the registry.
    permissions:
      contents: read
      packages: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      - name: Log in to Container Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Extract metadata
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
          # The deploy jobs pull `<image>:<full commit SHA>`, so that tag has
          # to be produced here; the branch tag is kept alongside it.
          tags: |
            type=ref,event=branch
            type=sha,format=long,prefix=
      - name: Build and push Docker image
        uses: docker/build-push-action@v4
        with:
          context: .
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max

  deploy-staging:
    needs: build-and-push
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/develop'
    environment: staging
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      - name: Configure kubectl
        uses: azure/k8s-set-context@v1
        with:
          method: kubeconfig
          kubeconfig: ${{ secrets.KUBE_CONFIG_STAGING }}
      - name: Deploy to staging
        # The manifests create the Deployment in the aurora-ai namespace, so
        # the follow-up commands must name that namespace explicitly.
        run: |
          kubectl apply -f k8s/
          kubectl -n aurora-ai set image deployment/aurora-web aurora-web=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.sha }}
          kubectl -n aurora-ai rollout status deployment/aurora-web

  deploy-production:
    needs: build-and-push
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/main'
    environment: production
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      - name: Configure kubectl
        uses: azure/k8s-set-context@v1
        with:
          method: kubeconfig
          kubeconfig: ${{ secrets.KUBE_CONFIG_PROD }}
      - name: Deploy to production
        # Same namespace requirement as the staging job.
        run: |
          kubectl apply -f k8s/
          kubectl -n aurora-ai set image deployment/aurora-web aurora-web=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.sha }}
          kubectl -n aurora-ai rollout status deployment/aurora-web
# monitoring/prometheus.yml
# Default scrape and rule-evaluation cadence; individual jobs may override it.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - "aurora_rules.yml"

scrape_configs:
  # Application metrics are served on a non-default path and scraped every
  # 30s instead of the global 15s.
  - job_name: 'aurora-web'
    static_configs:
      - targets: ['aurora-web:8080']
    metrics_path: '/api/monitoring/metrics'
    scrape_interval: 30s
  - job_name: 'postgres'
    static_configs:
      - targets: ['postgres-exporter:9187']
  - job_name: 'redis'
    static_configs:
      - targets: ['redis-exporter:9121']

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'
{
"dashboard": {
  "title": "Aurora AI System Dashboard",
  "panels": [
    {
      "title": "API Response Time",
      "type": "graph",
      "targets": [
        {
          "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
          "legendFormat": "95th percentile"
        },
        {
          "expr": "histogram_quantile(0.50, rate(http_request_duration_seconds_bucket[5m]))",
          "legendFormat": "50th percentile"
        }
      ]
    },
    {
      "title": "Request Rate",
      "type": "graph",
      "targets": [
        {
          "expr": "rate(http_requests_total[5m])",
          "legendFormat": "Requests/sec"
        }
      ]
    },
    {
      "title": "Error Rate",
      "type": "graph",
      "targets": [
        {
          "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m]))",
          "legendFormat": "Error Rate"
        }
      ]
    },
    {
      "title": "System Resources",
      "type": "graph",
      "targets": [
        {
          "expr": "rate(container_cpu_usage_seconds_total[5m])",
          "legendFormat": "CPU Usage"
        },
        {
          "expr": "container_memory_usage_bytes / 1024 / 1024",
          "legendFormat": "Memory Usage (MB)"
        }
      ]
    }
  ]
}
}
# main.tf
# AWS provider; the target region is supplied through var.aws_region.
provider "aws" {
  region = var.aws_region
}
# VPC Configuration
# A /16 network with DNS support and DNS hostnames enabled; the subnets in
# this file are created inside it.
resource "aws_vpc" "aurora_vpc" {
  cidr_block           = "10.0.0.0/16"
  enable_dns_support   = true
  enable_dns_hostnames = true

  tags = {
    Name = "aurora-ai-vpc"
  }
}
# Subnet Configuration
# Two public subnets, one per availability zone: 10.0.1.0/24 and 10.0.2.0/24.
# The availability-zone data source is not declared in this snippet.
resource "aws_subnet" "public" {
  count  = 2
  vpc_id = aws_vpc.aurora_vpc.id

  # Third octet is derived from the index; the previous template produced the
  # five-octet value "10.0.1.0.0/24", which is not a valid CIDR.
  cidr_block        = "10.0.${count.index + 1}.0/24"
  availability_zone = data.aws_availability_zones.available.names[count.index]

  tags = {
    Name = "aurora-ai-public-${count.index + 1}"
  }
}
# Two private subnets, one per availability zone: 10.0.3.0/24 and 10.0.4.0/24.
# The availability-zone data source is not declared in this snippet.
resource "aws_subnet" "private" {
  count  = 2
  vpc_id = aws_vpc.aurora_vpc.id

  # Third octet is derived from the index; the previous template produced the
  # five-octet value "10.0.3.0.0/24", which is not a valid CIDR.
  cidr_block        = "10.0.${count.index + 3}.0/24"
  availability_zone = data.aws_availability_zones.available.names[count.index]

  tags = {
    Name = "aurora-ai-private-${count.index + 1}"
  }
}
# EKS Cluster
# Control plane attached to every public and private subnet defined in this
# file. The IAM role and its policy attachment are declared outside this snippet.
resource "aws_eks_cluster" "aurora_cluster" {
  name     = "aurora-ai-cluster"
  role_arn = aws_iam_role.eks_cluster.arn

  vpc_config {
    subnet_ids = concat(
      aws_subnet.public[*].id,
      aws_subnet.private[*].id,
    )
  }

  # Explicit ordering: the policy attachment is listed as a dependency of the cluster.
  depends_on = [
    aws_iam_role_policy_attachment.eks_cluster_policy,
  ]
}
# RDS Database
# Encrypted PostgreSQL instance for the application. The security group and
# DB subnet group it references are declared outside this snippet.
resource "aws_db_instance" "aurora_db" {
  identifier     = "aurora-ai-db"
  engine         = "postgres"
  engine_version = "13.7"
  instance_class = "db.r6g.large"

  # Storage starts at 100 and may autoscale up to 1000.
  allocated_storage     = 100
  max_allocated_storage = 1000
  storage_type          = "gp2"
  storage_encrypted     = true

  db_name  = "aurora_ai"
  username = "aurora_user"
  password = var.db_password

  vpc_security_group_ids = [aws_security_group.db.id]
  db_subnet_group_name   = aws_db_subnet_group.aurora_db.name

  # Matches `multi_az: true` in deployment/architecture.yaml.
  multi_az = true

  backup_retention_period = 30
  backup_window           = "03:00-04:00"
  maintenance_window      = "sun:04:00-sun:05:00"

  # Take a final snapshot when the instance is destroyed, so destroying the
  # production database does not discard its data without a copy.
  skip_final_snapshot       = false
  final_snapshot_identifier = "aurora-ai-db-final"

  tags = {
    Name = "aurora-ai-database"
  }
}
# ElastiCache Redis
# Subnet group made up of the private subnets; referenced by the Redis
# cluster resource.
resource "aws_elasticache_subnet_group" "aurora_redis_subnet" {
  name       = "aurora-redis-subnet"
  subnet_ids = aws_subnet.private[*].id
}
# Redis cache node on port 6379. The security group it references is
# declared outside this snippet.
resource "aws_elasticache_cluster" "aurora_redis" {
  cluster_id = "aurora-redis"
  engine     = "redis"
  node_type  = "cache.r6g.large"

  # For the Redis engine this resource accepts exactly one node; a value of 3
  # is rejected.
  # NOTE(review): the three-node layout in deployment/architecture.yaml needs
  # an aws_elasticache_replication_group instead - confirm before migrating,
  # since that changes the resource address.
  num_cache_nodes = 1

  parameter_group_name = "default.redis6.x"
  port                 = 6379
  subnet_group_name    = aws_elasticache_subnet_group.aurora_redis_subnet.name
  security_group_ids   = [aws_security_group.redis.id]

  tags = {
    Name = "aurora-ai-redis"
  }
}
#!/bin/bash
# scripts/deploy.sh
#
# Rolls the aurora-web deployment to a new image tag and verifies it.
#
# Usage: deploy.sh [environment] [version] [region]
#   environment  kubectl context to switch to   (default: staging)
#   version      tag of the aurora-ai image     (default: latest)
#   region       only echoed by this script     (default: us-east-1)
# Environment:
#   NAMESPACE    namespace of the deployment    (default: aurora-ai)
#
# Exits non-zero as soon as any step fails.
set -euo pipefail

ENVIRONMENT="${1:-staging}"
VERSION="${2:-latest}"
REGION="${3:-us-east-1}"
# The manifests create the deployment in the aurora-ai namespace, so every
# kubectl call below names it instead of relying on the context default.
NAMESPACE="${NAMESPACE:-aurora-ai}"

echo "Deploying Aurora AI to $ENVIRONMENT environment"
echo "Version: $VERSION"
echo "Region: $REGION"

# Check prerequisites (helm is checked although this script does not invoke it)
command -v kubectl >/dev/null 2>&1 || { echo "kubectl is required but not installed." >&2; exit 1; }
command -v helm >/dev/null 2>&1 || { echo "helm is required but not installed." >&2; exit 1; }

# Set kubectl context
kubectl config use-context "$ENVIRONMENT"

# Update deployment
echo "Updating deployment..."
kubectl -n "$NAMESPACE" set image deployment/aurora-web "aurora-web=aurora-ai:$VERSION"

# Wait for rollout
echo "Waiting for rollout to complete..."
kubectl -n "$NAMESPACE" rollout status deployment/aurora-web --timeout=300s

# Verify deployment
echo "Verifying deployment..."
kubectl -n "$NAMESPACE" get pods -l app=aurora-web
kubectl -n "$NAMESPACE" get services

# Run health check from inside a pod of the deployment (needs curl in the image)
echo "Running health check..."
kubectl -n "$NAMESPACE" exec deployment/aurora-web -- curl -f http://localhost:8080/api/health

echo "Deployment completed successfully!"
#!/bin/bash
# scripts/rollback.sh
#
# Rolls the aurora-web deployment back and verifies the result.
#
# Usage: rollback.sh [environment] [revision]
#   environment  kubectl context to switch to                  (default: staging)
#   revision     numeric rollout revision, or "previous" to
#                undo the most recent rollout                   (default: previous)
# Environment:
#   NAMESPACE    namespace of the deployment                    (default: aurora-ai)
#
# Exits non-zero as soon as any step fails.
set -euo pipefail

ENVIRONMENT="${1:-staging}"
REVISION="${2:-previous}"
# The manifests create the deployment in the aurora-ai namespace.
NAMESPACE="${NAMESPACE:-aurora-ai}"

echo "Rolling back Aurora AI in $ENVIRONMENT environment to revision $REVISION"

# Set kubectl context
kubectl config use-context "$ENVIRONMENT"

# Rollback deployment
echo "Rolling back deployment..."
if [[ "$REVISION" == "previous" ]]; then
    # --to-revision only accepts a number; omitting it undoes the latest rollout.
    kubectl -n "$NAMESPACE" rollout undo deployment/aurora-web
elif [[ "$REVISION" =~ ^[0-9]+$ ]]; then
    kubectl -n "$NAMESPACE" rollout undo deployment/aurora-web --to-revision="$REVISION"
else
    echo "Revision must be a number or 'previous', got: $REVISION" >&2
    exit 1
fi

# Wait for rollback
echo "Waiting for rollback to complete..."
kubectl -n "$NAMESPACE" rollout status deployment/aurora-web --timeout=300s

# Verify rollback
echo "Verifying rollback..."
kubectl -n "$NAMESPACE" get pods -l app=aurora-web
kubectl -n "$NAMESPACE" get services

# Run health check from inside a pod of the deployment (needs curl in the image)
echo "Running health check..."
kubectl -n "$NAMESPACE" exec deployment/aurora-web -- curl -f http://localhost:8080/api/health

echo "Rollback completed successfully!"
- Environment variables configured
- Database connections tested
- Cache connections tested
- SSL certificates valid
- Load balancer configured
- Monitoring systems active
- Backup procedures verified
- Security scans completed
- Performance tests passed
- Documentation updated
- All pods running healthy
- Services accessible
- Database connections working
- Cache connections working
- API endpoints responding
- Monitoring metrics collecting
- Alert rules active
- Backup jobs scheduled
- Log aggregation working
- Staging deployment successful
- All tests passed in staging
- Performance benchmarks met
- Security audit completed
- Disaster recovery tested
- Monitoring dashboards configured
- Alert notifications tested
- Documentation updated
- Team notified
- Rollback plan ready
#!/bin/bash
# scripts/emergency-rollback.sh
#
# Undoes the latest rollout of aurora-web, checks health and notifies Slack.
# The notification and exit code reflect whether the rollback worked.
#
# Environment:
#   NAMESPACE  namespace of the deployment (default: aurora-ai)

# The manifests create the deployment in the aurora-ai namespace.
NAMESPACE="${NAMESPACE:-aurora-ai}"
status=0

echo "Emergency rollback initiated..."

# Get current deployment status. Bounded and non-fatal: without a timeout this
# waits for the rollout to finish, so a stuck rollout would block the rollback.
kubectl -n "$NAMESPACE" rollout status deployment/aurora-web --timeout=10s || true

# Rollback to previous stable version
kubectl -n "$NAMESPACE" rollout undo deployment/aurora-web || status=$?

# Wait for rollback completion
if [ "$status" -eq 0 ]; then
    kubectl -n "$NAMESPACE" rollout status deployment/aurora-web --timeout=300s || status=$?
fi

# Verify system health (needs curl in the image)
if [ "$status" -eq 0 ]; then
    kubectl -n "$NAMESPACE" exec deployment/aurora-web -- curl -f http://localhost:8080/api/health || status=$?
fi

# Notify team with the real outcome
if [ "$status" -eq 0 ]; then
    message="Emergency rollback completed for Aurora AI"
else
    message="Emergency rollback FAILED for Aurora AI - manual intervention required"
fi
curl -X POST "https://hooks.slack.com/your-webhook" \
    -H 'Content-type: application/json' \
    -d "{\"text\":\"$message\"}" \
    || echo "Warning: Slack notification could not be sent." >&2

if [ "$status" -ne 0 ]; then
    echo "Emergency rollback failed!" >&2
    exit "$status"
fi

echo "Emergency rollback completed!"
Aurora AI Deployment and DevOps Guide
Enterprise Deployment • CI/CD • Kubernetes • Infrastructure as Code