gHashTag
diff --git a/‎deploy/Dockerfile.inference‎
Lines changed: 76 additions & 0 deletions b/‎deploy/Dockerfile.inference‎
Lines changed: 76 additions & 0 deletions
diff --git a/‎deploy/fly.toml‎
Lines changed: 113 additions & 0 deletions b/‎deploy/fly.toml‎
Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,76 @@
+# ═══════════════════════════════════════════════════════════════════════════════
+# TRINITY INFERENCE - Production Docker Image
+# ═══════════════════════════════════════════════════════════════════════════════
+# Multi-stage build for minimal production image
+# Sacred Formula: V = n × 3^k × π^m × φ^p × e^q
+# Golden Identity: φ² + 1/φ² = 3
+# ═══════════════════════════════════════════════════════════════════════════════
+
+# Stage 1: Build
+FROM debian:bookworm-slim AS builder
+
+# Zig release used to compile the servers.
+# Override with: docker build --build-arg ZIG_VERSION=<version> ...
+ARG ZIG_VERSION=0.13.0
+
+# Install build dependencies.
+# ca-certificates is listed explicitly: with --no-install-recommends it is no
+# longer pulled in implicitly, and curl needs it to verify the HTTPS download.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    curl \
+    xz-utils \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Zig.
+# The archive is downloaded to a file instead of being piped into tar so that
+# an HTTP error (curl -f) fails this layer instead of feeding an error page to tar.
+RUN curl -fsSL -o /tmp/zig.tar.xz \
+        "https://ziglang.org/download/${ZIG_VERSION}/zig-linux-x86_64-${ZIG_VERSION}.tar.xz" \
+    && tar -xJf /tmp/zig.tar.xz -C /opt \
+    && rm /tmp/zig.tar.xz
+ENV PATH="/opt/zig-linux-x86_64-${ZIG_VERSION}:${PATH}"
+
+# Copy source
+WORKDIR /build
+COPY src/vibeec/*.zig ./src/vibeec/
+
+WORKDIR /build/src/vibeec
+
+# Build inference server with ReleaseFast.
+# `zig build-exe` selects the output path with -femit-bin=, not -o.
+# A compile error must fail the image build: the runtime stage executes this
+# file as its CMD, so a text placeholder would only defer the failure to deploy time.
+RUN zig build-exe tri_inference.zig -O ReleaseFast -femit-bin=/build/trinity-inference
+
+# Build autoscaling/health server (same flags, same fail-fast behaviour).
+RUN zig build-exe autoscaling.zig -O ReleaseFast -femit-bin=/build/trinity-health
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# Stage 2: Runtime
+# ═══════════════════════════════════════════════════════════════════════════════
+
+FROM debian:bookworm-slim
+
+# Install runtime dependencies.
+# curl is needed by the HEALTHCHECK below; ca-certificates for outbound TLS.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Create non-root user
+RUN useradd -m -s /bin/bash trinity
+WORKDIR /app
+
+# Create the writable directories and hand them (and /app itself) to the
+# non-root user. This runs before the binaries are copied so that no
+# recursive chown has to rewrite them into an additional layer.
+RUN mkdir -p /app/models /app/logs && \
+    chown trinity:trinity /app /app/models /app/logs
+
+# Copy binaries, already owned by the runtime user
+COPY --from=builder --chown=trinity:trinity /build/trinity-inference /app/trinity-inference
+COPY --from=builder --chown=trinity:trinity /build/trinity-health /app/trinity-health
+
+# Switch to non-root user
+USER trinity
+
+# Default environment
+ENV NUM_THREADS=16
+ENV METRICS_PORT=9090
+ENV HEALTH_PORT=8081
+ENV MODEL_PATH=/app/models/model.gguf
+
+# Expose ports
+EXPOSE 8080 8081 9090
+
+# Health check against the readiness endpoint on port 8081.
+# NOTE(review): CMD starts only /app/trinity-inference; confirm that process
+# serves /health/ready on 8081, or that /app/trinity-health is started elsewhere.
+HEALTHCHECK --interval=15s --timeout=5s --start-period=30s --retries=3 \
+    CMD curl -f http://localhost:8081/health/ready || exit 1
+
+# Start inference server
+CMD ["/app/trinity-inference"]
@@ -0,0 +1,113 @@
+# ═══════════════════════════════════════════════════════════════════════════════
+# TRINITY INFERENCE - Production Deployment
+# ═══════════════════════════════════════════════════════════════════════════════
+# Auto-scaling LLM inference with health checks and monitoring
+# Sacred Formula: V = n × 3^k × π^m × φ^p × e^q
+# Golden Identity: φ² + 1/φ² = 3
+# ═══════════════════════════════════════════════════════════════════════════════
+
+# Application name and the region new machines are placed in by default.
+app = "trinity-inference"
+primary_region = "iad"
+
+# Image is built from the Dockerfile named here.
+[build]
+dockerfile = "Dockerfile.inference"
+
+[env]
+# Inference engine limits.
+# NOTE(review): the [[vm]] section in this file requests 4 CPUs; confirm that
+# 16 threads is the intended value.
+MAX_BATCH_SIZE = "32"
+MAX_SEQUENCE_LENGTH = "4096"
+NUM_THREADS = "16"
+
+# Ports for the health and metrics endpoints.
+HEALTH_PORT = "8081"
+METRICS_PORT = "9090"
+
+# Scaling thresholds.
+TARGET_CPU_PERCENT = "70"
+TARGET_QUEUE_DEPTH = "50"
+TARGET_TTFT_MS = "100"
+
+# Public HTTP entry point for the inference server (container port 8080).
+[http_service]
+processes = ["app"]
+internal_port = 8080
+force_https = true
+
+# Machine lifecycle: start on demand, never stop automatically, keep at least one running.
+auto_start_machines = true
+auto_stop_machines = false
+min_machines_running = 1
+
+# Per-machine load limits, counted in concurrent requests.
+[http_service.concurrency]
+type = "requests"
+soft_limit = 80
+hard_limit = 100
+
+  # HTTP readiness probe for the service.
+  # `machine_checks` is an array-of-tables describing a check machine to run
+  # (image/command); path-based HTTP probes are declared under
+  # [[http_service.checks]] instead.
+  # NOTE(review): service checks probe the service's internal_port (8080);
+  # confirm the inference server answers /health/ready on that port.
+  [[http_service.checks]]
+    grace_period = "30s"
+    interval = "15s"
+    timeout = "5s"
+    method = "GET"
+    path = "/health/ready"
+
+# Prometheus metrics endpoint: external port 9090 -> container port 9090.
+# NOTE(review): no handlers are configured on this port; confirm the metrics
+# endpoint is meant to be reachable from outside the app.
+[[services]]
+protocol = "tcp"
+internal_port = 9090
+ports = [{ port = 9090 }]
+
+# Health endpoint: external port 8081 -> container port 8081.
+[[services]]
+protocol = "tcp"
+internal_port = 8081
+ports = [{ port = 8081 }]
+
+# Machine sizing for inference: performance-class CPUs.
+[[vm]]
+cpus = 4
+cpu_kind = "performance"
+memory_mb = 8192
+
+# Health checks. All three are HTTP GET probes against port 8081.
+
+# Liveness: /health/live
+[checks.liveness]
+type = "http"
+method = "GET"
+port = 8081
+path = "/health/live"
+grace_period = "10s"
+interval = "10s"
+timeout = "3s"
+
+# Readiness: /health/ready
+[checks.readiness]
+type = "http"
+method = "GET"
+port = 8081
+path = "/health/ready"
+grace_period = "30s"
+interval = "15s"
+timeout = "5s"
+
+# Startup: /health/startup, polled every 5s with the longest grace period.
+[checks.startup]
+type = "http"
+method = "GET"
+port = 8081
+path = "/health/startup"
+grace_period = "60s"
+interval = "5s"
+timeout = "10s"
+
+# Deployment behaviour: automatic rollback is switched on.
+[experimental]
+auto_rollback = true
+
+# Persistent volume for model files, attached at /app/models.
+[[mounts]]
+source = "trinity_models"
+destination = "/app/models"
+initial_size = "10gb"
+
+# Secrets (set via fly secrets set)
+# FLY_API_TOKEN - for scaling API
+# MODEL_PATH - path to model file