Commit ff7a96d

Support server (#44)
* add server support
* Add logo to serve
* fix tests
* fix lint
* fix tests
* fix tests
* fix test
* fix lint
* remove libs
* fix tests
* change log lib
* change the request log level

Signed-off-by: kerthcet <kerthcet@gmail.com>
1 parent 1fa6418 commit ff7a96d

25 files changed

Lines changed: 2503 additions & 515 deletions

Cargo.lock

Lines changed: 870 additions & 504 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 12 additions & 2 deletions
@@ -13,8 +13,8 @@ reqwest = { version = "0.12", features = ["json"] }
 tokio = { version = "1", features = ["full"] }
 serde = { version = "1.0", features = ["derive"] }
 serde_derive = "1.0"
-env_logger = "0.11.6"
-log = "0.4.26"
+tracing = "0.1"
+tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 indicatif = "0.18"
 dirs = "6.0.0"
 hf-hub = { version = "0.5.0", features = ["tokio"] }
@@ -26,5 +26,15 @@ rusqlite = { version = "0.32", features = ["bundled"] }
 rusqlite_migration = "1.3"
 regex = "1.11"
 
+# Web server
+axum = "0.7"
+tower = "0.4"
+tower-http = { version = "0.5", features = ["cors", "trace"] }
+uuid = { version = "1.0", features = ["v4", "serde"] }
+futures = "0.3"
+tokio-stream = "0.1"
+
 [dev-dependencies]
 tempfile = "3.12"
+tower = { version = "0.4", features = ["util"] }
+serde_json = "1.0"

README.md

Lines changed: 121 additions & 2 deletions
@@ -21,6 +21,8 @@
 
 💻 **System Detection** - Automatic GPU detection and resource reporting
 
+🚀 **OpenAI-Compatible API** - RESTful API with streaming support
+
 ## Installation
 
 ### Install with Cargo
@@ -45,6 +47,8 @@ make build
 
 ## Quick Start
 
+### CLI Usage
+
 ```bash
 # Download a model
 puma pull inftyai/tiny-random-gpt2
@@ -62,6 +66,39 @@ puma info
 puma rm inftyai/tiny-random-gpt2
 ```
 
+### API Server
+
+```bash
+# Start the inference server
+puma serve
+
+# Server will start on http://0.0.0.0:8000
+# API endpoints:
+#   POST /v1/chat/completions
+#   POST /v1/completions
+#   GET  /v1/models
+#   GET  /v1/models/:model
+#   GET  /health
+```
+
+**Test the API:**
+
+```bash
+# Health check
+curl http://localhost:8000/health
+
+# Chat completion
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "inftyai/tiny-random-gpt2",
+    "messages": [{"role": "user", "content": "Hello!"}]
+  }'
+
+# Or use the test script
+./hack/scripts/test_api.sh
+```
+
 ## Commands
 
 | Command | Status | Description |
@@ -72,6 +109,7 @@ puma rm inftyai/tiny-random-gpt2
 | `rm <model>` | ✅ | Remove model and cache |
 | `info` | ✅ | Display system information |
 | `version` | ✅ | Show PUMA version |
+| `serve` | ✅ | Start OpenAI-compatible API server |
 | `ps` | 🚧 | List running models |
 | `run` | 🚧 | Start model inference |
 | `stop` | 🚧 | Stop running model |
@@ -106,6 +144,81 @@ puma ls llama -l author=meta
 
 **Available filters:** `author`, `task`, `license`, `provider`, `model_series`
 
+## API Server
+
+PUMA provides an OpenAI-compatible API server for model inference.
+
+### Starting the Server
+
+```bash
+# Default: 0.0.0.0:8000
+puma serve
+
+# Custom host and port
+puma serve --host 127.0.0.1 --port 3000
+```
+
+### API Endpoints
+
+#### Chat Completions (Recommended)
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "inftyai/tiny-random-gpt2",
+    "messages": [
+      {"role": "system", "content": "You are a helpful assistant."},
+      {"role": "user", "content": "Hello!"}
+    ],
+    "max_tokens": 100,
+    "temperature": 0.7
+  }'
+```
+
+#### Streaming (Server-Sent Events)
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "inftyai/tiny-random-gpt2",
+    "messages": [{"role": "user", "content": "Tell me a story"}],
+    "stream": true
+  }'
+```
+
+#### List Models
+```bash
+curl http://localhost:8000/v1/models
+```
+
+#### Health Check
+```bash
+curl http://localhost:8000/health
+# Returns: {"status":"ok","version":"0.0.2"}
+```
+
+### OpenAI Python Client
+
+PUMA is compatible with the OpenAI Python SDK:
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    base_url="http://localhost:8000/v1",
+    api_key="dummy"  # Not required
+)
+
+response = client.chat.completions.create(
+    model="inftyai/tiny-random-gpt2",
+    messages=[
+        {"role": "user", "content": "Hello!"}
+    ]
+)
+
+print(response.choices[0].message.content)
+```
+
 ### Inspect Output
 
 ```bash
@@ -146,22 +259,28 @@ Models are stored with lowercase names for case-insensitive matching.
 # Build
 make build
 
-# Run tests (67 unit + 22 integration)
+# Run all tests
 make test
+
+# Test API manually
+./hack/scripts/test_api.sh
 ```
 
 ### Project Structure
 
 ```
 puma/
 ├── src/
-│   ├── cli/          # Command implementations (ls, rm, inspect)
+│   ├── api/          # OpenAI-compatible API
+│   ├── backend/      # Inference backends (Mock, MLX)
+│   ├── cli/          # Command implementations
 │   ├── downloader/   # HuggingFace download logic
 │   ├── registry/     # Model registry & metadata
 │   ├── storage/      # SQLite storage backend
 │   ├── system/       # System info detection
 │   └── utils/        # Formatting & helpers
 ├── tests/            # Integration tests
+├── hack/             # Development scripts
 ├── Cargo.toml        # Rust dependencies
 └── Makefile          # Build commands
 ```
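The streaming endpoint shown in the README returns Server-Sent Events, where each event is a `data: {...}` line carrying a JSON chunk and the stream ends with a `data: [DONE]` sentinel. As a sketch, this is how a client might decode such a transcript; the chunk fields follow the OpenAI chunk format, and the sample payload below is illustrative, not captured from PUMA:

```python
import json


def parse_sse_chunks(raw: str) -> list:
    """Extract JSON payloads from an OpenAI-style SSE transcript.

    Each event is a line of the form 'data: {...}'; the stream
    terminates at the 'data: [DONE]' sentinel.
    """
    chunks = []
    for line in raw.splitlines():
        line = line.strip()
        if not line.startswith("data:"):
            continue  # skip blank keep-alive lines and comments
        payload = line[len("data:"):].strip()
        if payload == "[DONE]":
            break
        chunks.append(json.loads(payload))
    return chunks


# Illustrative transcript in the OpenAI chunk format (not real PUMA output).
sample = (
    'data: {"choices": [{"delta": {"content": "Once"}}]}\n'
    '\n'
    'data: {"choices": [{"delta": {"content": " upon"}}]}\n'
    '\n'
    'data: [DONE]\n'
)

# Concatenate the incremental deltas into the full completion text.
text = "".join(
    c["choices"][0]["delta"].get("content", "")
    for c in parse_sse_chunks(sample)
)
print(text)  # prints: Once upon
```

In a real client the same loop would run over the chunked HTTP response body line by line instead of a prebuilt string.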

hack/README.md

Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+# Hack Directory
+
+Development and testing utilities for PUMA.
+
+## Structure
+
+```
+hack/
+└── scripts/        # Test and utility scripts
+    └── test_api.sh
+```
+
+## Scripts
+
+### `scripts/test_api.sh`
+
+Tests all PUMA API endpoints manually.
+
+**Usage:**
+```bash
+# Start PUMA server first
+./puma serve
+
+# In another terminal
+./hack/scripts/test_api.sh
+```
+
+**Tests:**
+- Health check
+- List models
+- Chat completion (non-streaming)
+- Chat completion (streaming)
+- Text completion
+
+**Requirements:**
+- Running PUMA server
+- `curl` and `jq` installed
+
+---
+
+## Adding New Scripts
+
+Place development and testing scripts in `hack/scripts/`:
+
+```bash
+# Create new script
+cat > hack/scripts/my_script.sh << 'EOF'
+#!/bin/bash
+# Your script here
+EOF
+
+# Make executable
+chmod +x hack/scripts/my_script.sh
+```
+
+---
+
+## Why "hack"?
+
+The `hack/` directory is a convention from Kubernetes and other projects for:
+- Development utilities
+- Test scripts
+- Build helpers
+- CI/CD scripts
+- One-off tools
+
+It keeps the root directory clean while providing a place for development tools.

hack/scripts/test_api.sh

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+echo "Testing PUMA OpenAI-Compatible API"
+echo "===================================="
+echo
+
+# Base URL
+BASE_URL="http://localhost:8000"
+
+echo "1. Health Check"
+curl -s "$BASE_URL/health"
+echo -e "\n"
+
+echo "2. List Models"
+curl -s "$BASE_URL/v1/models" | jq '.'
+echo
+
+echo "3. Chat Completion (Non-streaming)"
+curl -s "$BASE_URL/v1/chat/completions" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "test-model",
+    "messages": [
+      {"role": "user", "content": "Hello!"}
+    ],
+    "max_tokens": 50
+  }' | jq '.'
+echo
+
+echo "4. Chat Completion (Streaming)"
+curl -s -N "$BASE_URL/v1/chat/completions" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "test-model",
+    "messages": [
+      {"role": "user", "content": "Tell me a story"}
+    ],
+    "stream": true,
+    "max_tokens": 50
+  }'
+echo -e "\n"
+
+echo "5. Legacy Text Completion"
+curl -s "$BASE_URL/v1/completions" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "test-model",
+    "prompt": "Once upon a time",
+    "max_tokens": 50
+  }' | jq '.'
+echo
+
+echo "Done!"
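The shell script above eyeballs each response through `jq`; the same checks can be made mechanical. As a sketch, a small Python helper could validate that a chat-completion response has the shape the OpenAI spec requires. The required field list follows the OpenAI chat completion format, and the sample response below is illustrative, not captured from PUMA:

```python
def validate_chat_response(resp: dict) -> list:
    """Return a list of problems with an OpenAI-style chat completion
    response; an empty list means the shape looks correct."""
    problems = []
    # Top-level fields required by the OpenAI chat completion format.
    for field in ("id", "object", "created", "model", "choices"):
        if field not in resp:
            problems.append(f"missing field: {field}")
    # Each choice must carry an assistant message with content.
    for i, choice in enumerate(resp.get("choices", [])):
        msg = choice.get("message", {})
        if msg.get("role") != "assistant":
            problems.append(f"choices[{i}]: role should be 'assistant'")
        if "content" not in msg:
            problems.append(f"choices[{i}]: missing message.content")
    return problems


# Illustrative well-formed response (not real PUMA output).
ok = {
    "id": "chatcmpl-123",
    "object": "chat.completion",
    "created": 0,
    "model": "test-model",
    "choices": [
        {
            "index": 0,
            "message": {"role": "assistant", "content": "Hello!"},
            "finish_reason": "stop",
        }
    ],
}

print(validate_chat_response(ok))  # prints: []
```

Wired into a test, the helper turns the manual `curl | jq` inspection into a pass/fail assertion on the decoded JSON body.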
