# Architecture

How PyLet works under the hood. Read this if you're curious — it's not required for using PyLet.

## System Design

```
┌──────────────┐      poke      ┌──────────────────┐
│  Controller  │───────────────>│    Scheduler     │
│  (FastAPI)   │                │   (in-process)   │
└──────┬───────┘                └────────┬─────────┘
       │                                 │
       │         ┌──────────────┐        │
       └────────>│    SQLite    │<───────┘
                 │    (WAL)     │
                 └──────────────┘
                        ▲
                        │ heartbeat (long-poll)
        ┌───────────────┼───────────────┐
        │               │               │
   ┌────┴────┐     ┌────┴────┐     ┌────┴────┐
   │ Worker  │     │ Worker  │     │ Worker  │
   └─────────┘     └─────────┘     └─────────┘
```

**Head node**: Runs the controller (FastAPI) and scheduler. SQLite is the single source of truth.

**Workers**: Connect to the head, receive desired state via heartbeat, reconcile local processes.

---

## The One Primitive: Instance

PyLet has exactly one concept: the **instance** — a process with resource allocation.

An instance has:
- A command to run
- Resource requirements (CPU, GPU, memory)
- A lifecycle (PENDING → ASSIGNED → RUNNING → COMPLETED/FAILED)
- An optional endpoint (host:port) for service discovery

That's it. No pods, replicas, services, deployments, or jobs. Higher-level abstractions are left to the application.
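
Concretely, an instance record can be pictured as a small dataclass. This is an illustrative sketch only; the real models live in `schemas.py` as Pydantic classes, and every field name here is an assumption:

```python
from dataclasses import dataclass
from enum import Enum
from typing import Optional, Tuple


class InstanceState(str, Enum):
    PENDING = "PENDING"
    ASSIGNED = "ASSIGNED"
    RUNNING = "RUNNING"
    UNKNOWN = "UNKNOWN"
    COMPLETED = "COMPLETED"
    FAILED = "FAILED"
    CANCELLED = "CANCELLED"


@dataclass
class Instance:
    """Hypothetical shape of the one primitive: a process plus resources."""
    instance_id: str
    command: str                                # the command to run
    cpus: float = 1.0                           # resource requirements
    gpus: int = 0
    memory_mb: int = 512
    state: InstanceState = InstanceState.PENDING
    attempt: int = 0                            # incremented on each assignment
    endpoint: Optional[Tuple[str, int]] = None  # (host, port) for discovery


inst = Instance(instance_id="i-123", command="python train.py", gpus=1)
```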

---

## Worker Reconciliation

Workers don't receive "start X" / "stop Y" commands. Instead, they receive **desired state** and reconcile:

```
Desired state (from head):  [instance_a@attempt=2, instance_b@attempt=1]
Actual state (local):       [instance_a@attempt=1, instance_c@attempt=1]

Reconcile:
  instance_a@attempt=1 → stale attempt → kill
  instance_b@attempt=1 → not running   → start
  instance_c@attempt=1 → not desired   → kill
```

This declarative model means:
- **Crash recovery is automatic**: worker restarts, gets desired state, reconciles
- **Network partitions are safe**: stale workers can't corrupt state (attempt fencing)
- **No command queue**: simpler than ack/retry protocols
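
The reconcile step above boils down to a pure diff over `{instance_id: attempt}` maps. A minimal sketch under assumed data shapes, not PyLet's actual worker code:

```python
def reconcile(desired: dict, actual: dict) -> tuple:
    """Diff desired vs actual {instance_id: attempt} maps into kill/start lists."""
    to_kill, to_start = [], []
    for iid, attempt in actual.items():
        # Not desired at all, or running a stale attempt: kill it.
        if desired.get(iid) != attempt:
            to_kill.append((iid, attempt))
    for iid, attempt in desired.items():
        # Desired but not running at the right attempt: start it.
        if actual.get(iid) != attempt:
            to_start.append((iid, attempt))
    return to_kill, to_start


kill, start = reconcile(
    desired={"instance_a": 2, "instance_b": 1},
    actual={"instance_a": 1, "instance_c": 1},
)
# kill:  instance_a@1 (stale attempt) and instance_c@1 (not desired)
# start: instance_a@2 (fresh attempt) and instance_b@1 (not running)
```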

---

## Instance Lifecycle

```
PENDING ──[assign]──> ASSIGNED ──[start]──> RUNNING ──[exit]──> COMPLETED
   │                     │                     │                    │
   │                     │                     │                 FAILED
   │                     │                     │
   │                     └─[worker offline]────┴──> UNKNOWN
   │
   └──[cancel]──────────────────────────────────> CANCELLED
```

| State | Meaning |
|:------|:--------|
| PENDING | Waiting in queue |
| ASSIGNED | Worker selected, resources reserved |
| RUNNING | Process executing |
| UNKNOWN | Worker offline, outcome uncertain |
| COMPLETED | Exit code 0 |
| FAILED | Exit code ≠ 0 |
| CANCELLED | User cancelled |
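
The diagram maps naturally onto a transition table. A sketch under assumptions; in particular, how UNKNOWN resolves is simplified here, and the authoritative rules live in `schemas.py`:

```python
# Transition table read off the lifecycle diagram. Simplified; not PyLet's
# actual encoding.
ALLOWED = {
    "PENDING":   {"ASSIGNED", "CANCELLED"},
    "ASSIGNED":  {"RUNNING", "UNKNOWN", "CANCELLED"},
    "RUNNING":   {"COMPLETED", "FAILED", "UNKNOWN", "CANCELLED"},
    "UNKNOWN":   {"PENDING"},  # assumption: requeued at a new attempt
    "COMPLETED": set(),        # terminal states
    "FAILED":    set(),
    "CANCELLED": set(),
}


def can_transition(src: str, dst: str) -> bool:
    """True if the lifecycle diagram permits src -> dst."""
    return dst in ALLOWED.get(src, set())
```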

### Cancellation Model

Cancellation uses a timestamp model (like Kubernetes `deletionTimestamp`):

1. User requests cancel → `cancellation_requested_at` is set
2. Instance excluded from desired state
3. Worker sees absence, sends SIGTERM
4. Grace period (default 30s)
5. SIGKILL if still running
6. Worker reports CANCELLED
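
Steps 3-5 amount to a SIGTERM-then-SIGKILL escalation. A minimal synchronous sketch using `subprocess.Popen`; the real worker manages this asynchronously:

```python
import signal
import subprocess
import time


def terminate_with_grace(proc: subprocess.Popen, grace_seconds: float = 30.0) -> None:
    """SIGTERM, wait out the grace period, then SIGKILL (steps 3-5 above)."""
    proc.send_signal(signal.SIGTERM)    # polite stop request
    deadline = time.monotonic() + grace_seconds
    while proc.poll() is None and time.monotonic() < deadline:
        time.sleep(0.1)                 # poll until exit or deadline
    if proc.poll() is None:
        proc.kill()                     # SIGKILL: no more waiting
        proc.wait()
```

With the default 30s this mirrors step 4; a shorter grace period just tightens the escalation.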

---

## Heartbeat Protocol

Workers use **generation-based long-polling**:

1. Worker sends heartbeat with `last_seen_gen` and instance status reports
2. Controller processes reports (with attempt fencing)
3. Controller waits for state change or timeout (30s)
4. Returns new `gen` and `desired_instances`

**Cancel-and-reissue**: When local state changes (process starts/exits), the worker cancels the in-flight heartbeat and sends a new one immediately. This means the head gets updates within milliseconds.
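
One round of the protocol can be sketched with the HTTP call stubbed out. The names and response shape here are assumptions for illustration, not PyLet's wire format:

```python
def heartbeat_once(send, last_seen_gen: int, reports: list):
    """One heartbeat round: report local status, long-poll for news.

    `send` stands in for the HTTP call to the head; the real call blocks
    until the head's generation advances past last_seen_gen or ~30s elapse.
    """
    resp = send(last_seen_gen=last_seen_gen, reports=reports)
    return resp["gen"], resp["desired_instances"]


def fake_send(last_seen_gen, reports):
    # Stub head: pretend state changed right away.
    return {"gen": last_seen_gen + 1,
            "desired_instances": {"instance_b": 1}}


gen, desired = heartbeat_once(fake_send, last_seen_gen=4, reports=[])
```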

---

## Attempt-Based Fencing

Each instance has an `attempt` counter that increments on each assignment:

```
Instance assigned to Worker A (attempt=1)
Network partition...
Instance reassigned to Worker B (attempt=2)
Worker A reconnects, reports for attempt=1
→ Controller ignores (stale attempt)
```

Only reports matching the current attempt can change state. This prevents stale workers from corrupting cluster state.
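
On the controller side, fencing reduces to a single comparison when a report arrives. A sketch over assumed dict shapes, not PyLet's actual schema:

```python
def apply_report(instance: dict, report: dict) -> bool:
    """Apply a worker's status report with attempt fencing.

    Returns True if the report was accepted; stale reports are dropped.
    """
    if report["attempt"] != instance["attempt"]:
        return False                    # stale worker: ignore
    instance["state"] = report["state"]
    return True


# Worker A reconnects after the partition, still on attempt=1,
# but the instance was reassigned at attempt=2:
inst = {"attempt": 2, "state": "RUNNING"}
accepted = apply_report(inst, {"attempt": 1, "state": "FAILED"})
# accepted is False and inst["state"] stays "RUNNING"
```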

---

## Fine-Grained GPU Scheduling

These features exist because real research workloads need them:

- **Physical GPU indices** (`gpu_indices`): Request specific GPUs. Exposed via `CUDA_VISIBLE_DEVICES`.
- **GPU sharing** (`exclusive=False`): GPUs aren't reserved exclusively. Enables daemons to coexist with inference.
- **Worker placement** (`target_worker`): Target a specific worker (e.g., where a model is cached).
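
A placement check over these three knobs might look like the following. The field names and worker-state shape are assumptions for illustration, not the actual scheduler:

```python
def fits(worker: dict, req: dict) -> bool:
    """Can this worker satisfy the request? (Illustrative sketch.)"""
    # Worker placement: a target_worker pin excludes everyone else.
    if req.get("target_worker") and worker["name"] != req["target_worker"]:
        return False
    # GPU sharing: exclusive requests see only unreserved GPUs;
    # shared requests may land on GPUs that are already reserved.
    if req.get("exclusive", True):
        available = set(worker["free_gpus"])
    else:
        available = set(worker["all_gpus"])
    # Physical GPU indices: specific GPUs must all be available.
    wanted = set(req.get("gpu_indices") or [])
    if wanted:
        return wanted <= available
    return req.get("num_gpus", 0) <= len(available)


worker = {"name": "w1", "free_gpus": [2, 3], "all_gpus": [0, 1, 2, 3]}
```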

---

## Log Capture

Instance logs are captured using a **sidecar pattern**:

1. Worker wraps each command: `(cmd) 2>&1 | python3 -m pylet.log_sidecar`
2. Sidecar writes rotating log files in `~/.pylet/logs/`
3. Worker runs an HTTP server (port 15599) for log retrieval
4. Head proxies log requests to workers

The sidecar survives even if the instance crashes, so logs are never lost.
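
In spirit, the sidecar is a loop that drains stdin into a size-capped rotating file. A rough sketch; the real `pylet.log_sidecar` module surely differs in details:

```python
import logging
import sys
from logging.handlers import RotatingFileHandler


def run_sidecar(log_path: str, max_bytes: int = 10 * 1024 * 1024,
                backups: int = 5) -> None:
    """Drain stdin into a rotating log file until EOF.

    Because this runs as a separate process, the instance crashing just
    closes the pipe: the loop ends cleanly and the logs survive.
    """
    handler = RotatingFileHandler(log_path, maxBytes=max_bytes,
                                  backupCount=backups)
    handler.setFormatter(logging.Formatter("%(message)s"))
    log = logging.getLogger("pylet.sidecar.sketch")
    log.addHandler(handler)
    log.setLevel(logging.INFO)
    for line in sys.stdin:          # blocks until the pipe closes
        log.info(line.rstrip("\n"))
```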

---

## Components

| File | Purpose |
|:-----|:--------|
| `controller.py` | Core scheduling and state management |
| `worker.py` | Process management and reconciliation |
| `schemas.py` | Pydantic models, state transitions |
| `db.py` | SQLite persistence layer |
| `server.py` | FastAPI HTTP endpoints |
| `client.py` | Async HTTP client |

---

## Design Decisions

| Decision | Choice | Why |
|:---------|:-------|:----|
| Database | SQLite (WAL mode) | Single file, no dependencies, survives restarts |
| Heartbeat | Long-poll | Instant updates, natural liveness check |
| State model | Declarative reconciliation | Automatic crash recovery, no command queue |
| Head topology | Single head | Simpler than consensus, good enough for ~100 nodes |
| GPU scheduling | Integer-based (not fractional) | Predictable, no oversubscription surprises |

---

## Limitations

| Limitation | Value | Workaround |
|:-----------|:------|:-----------|
| Port range per worker | 101 ports (15600–15700) | Deploy fewer services per worker |
| Log retention | 50 MB per instance | Use external log aggregation |
| SQLite scale | ~10K instances | Archive completed instances |
| Single head node | No redundancy | Run head on reliable hardware |
| No load balancing | N/A | Use nginx/HAProxy externally |
| No job dependencies | N/A | Handle in application logic |
| No authentication | N/A | Use network-level security |