Skip to content

Commit 82b9cb6

Browse files
authored
Add model to serve command (#50)
* Add model to serve command

  Signed-off-by: kerthcet <kerthcet@gmail.com>

* fix test

  Signed-off-by: kerthcet <kerthcet@gmail.com>

---------

Signed-off-by: kerthcet <kerthcet@gmail.com>
1 parent e6e2b2b commit 82b9cb6

5 files changed

Lines changed: 92 additions & 13 deletions

File tree

README.md

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,8 @@ puma rm inftyai/tiny-random-gpt2
6969
### API Server
7070

7171
```bash
72-
# Start the inference server
73-
puma serve
72+
# Start the inference server with a model
73+
puma serve inftyai/tiny-random-gpt2
7474

7575
# Server will start on http://0.0.0.0:8000
7676
# API endpoints:
@@ -109,7 +109,7 @@ curl http://localhost:8000/v1/chat/completions \
109109
| `rm <model>` || Remove model and cache |
110110
| `info` || Display system information |
111111
| `version` || Show PUMA version |
112-
| `serve` || Start OpenAI-compatible API server |
112+
| `serve <model>` || Start OpenAI-compatible API server with a model |
113113
| `ps` | 🚧 | List running models |
114114
| `run` | 🚧 | Start model inference |
115115
| `stop` | 🚧 | Stop running model |
@@ -151,11 +151,14 @@ PUMA provides an OpenAI-compatible API server for model inference.
151151
### Starting the Server
152152

153153
```bash
154-
# Default: 0.0.0.0:8000
155-
puma serve
154+
# Start server with a model (default: 0.0.0.0:8000)
155+
puma serve inftyai/tiny-random-gpt2
156156

157157
# Custom host and port
158-
puma serve --host 127.0.0.1 --port 3000
158+
puma serve inftyai/tiny-random-gpt2 --host 127.0.0.1 --port 3000
159+
160+
# Note: the model must be pulled before it can be served
161+
puma pull inftyai/tiny-random-gpt2
159162
```
160163

161164
### API Endpoints
@@ -188,13 +191,14 @@ curl http://localhost:8000/v1/chat/completions \
188191

189192
#### List Models
190193
```bash
194+
# Returns the currently loaded model
191195
curl http://localhost:8000/v1/models
192196
```
193197

194198
#### Health Check
195199
```bash
196200
curl http://localhost:8000/health
197-
# Returns: {"status":"ok","version":"0.0.2"}
201+
# Returns: {"status":"ok"}
198202
```
199203

200204
### OpenAI Python Client

src/api/routes.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,13 +60,11 @@ pub fn create_router<E: InferenceEngine + Clone + 'static>(
6060
#[derive(Serialize)]
6161
struct HealthResponse {
6262
status: String,
63-
version: String,
6463
}
6564

6665
/// Health check endpoint
6766
async fn health_check() -> Json<HealthResponse> {
6867
Json(HealthResponse {
6968
status: "ok".to_string(),
70-
version: env!("CARGO_PKG_VERSION").to_string(),
7169
})
7270
}

src/api/tests.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,6 @@ async fn test_health_check() {
100100

101101
assert_eq!(status, StatusCode::OK);
102102
assert_eq!(json["status"], "ok");
103-
assert!(json["version"].is_string());
104103
}
105104

106105
#[tokio::test]

src/cli/commands.rs

Lines changed: 75 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ enum Commands {
4343

4444
#[derive(Parser)]
4545
struct ServeArgs {
46+
/// Model name to serve (e.g., inftyai/tiny-random-gpt2)
47+
model: String,
48+
4649
/// Host address to bind to
4750
#[arg(long, default_value = "0.0.0.0")]
4851
host: String,
@@ -221,7 +224,24 @@ pub async fn run(cli: Cli) {
221224
}
222225

223226
Commands::SERVE(args) => {
224-
if let Err(e) = crate::cli::serve::execute(&args.host, args.port).await {
227+
// Verify model exists
228+
let registry = ModelRegistry::new(None);
229+
match registry.get_model(&args.model) {
230+
Ok(Some(_)) => {
231+
// Model exists, proceed
232+
}
233+
Ok(None) => {
234+
eprintln!("❌ Error: Model '{}' not found in registry", args.model);
235+
eprintln!("Run 'puma pull {}' to download it first", args.model);
236+
std::process::exit(1);
237+
}
238+
Err(e) => {
239+
eprintln!("❌ Error checking model: {}", e);
240+
std::process::exit(1);
241+
}
242+
}
243+
244+
if let Err(e) = crate::cli::serve::execute(&args.host, args.port, &args.model).await {
225245
eprintln!("Error starting server: {}", e);
226246
std::process::exit(1);
227247
}
@@ -392,4 +412,58 @@ mod tests {
392412
assert_eq!(result.metadata.cache.revision, "v2");
393413
assert_eq!(result.metadata.cache.size, 2000);
394414
}
415+
416+
#[test]
417+
fn test_serve_with_existing_model() {
418+
let temp_dir = TempDir::new().unwrap();
419+
let registry = ModelRegistry::new(Some(temp_dir.path().to_path_buf()));
420+
421+
let model = create_test_model("test/serve-model", "abc123");
422+
registry.register_model(model).unwrap();
423+
424+
// Verify model exists (this is what serve command checks)
425+
let result = registry.get_model("test/serve-model");
426+
assert!(result.is_ok());
427+
assert!(result.unwrap().is_some());
428+
}
429+
430+
#[test]
431+
fn test_serve_with_nonexistent_model() {
432+
let temp_dir = TempDir::new().unwrap();
433+
let registry = ModelRegistry::new(Some(temp_dir.path().to_path_buf()));
434+
435+
// Verify model doesn't exist
436+
let result = registry.get_model("nonexistent/model");
437+
assert!(result.is_ok());
438+
assert!(result.unwrap().is_none());
439+
}
440+
441+
#[test]
442+
fn test_serve_args_parsing() {
443+
// Test that ServeArgs requires model argument
444+
use clap::CommandFactory;
445+
let app = Cli::command();
446+
447+
// This should fail without model argument
448+
let result = app.clone().try_get_matches_from(vec!["puma", "serve"]);
449+
assert!(result.is_err());
450+
451+
// This should succeed with model argument
452+
let result = app
453+
.clone()
454+
.try_get_matches_from(vec!["puma", "serve", "test/model"]);
455+
assert!(result.is_ok());
456+
457+
// This should succeed with model and optional args
458+
let result = app.try_get_matches_from(vec![
459+
"puma",
460+
"serve",
461+
"test/model",
462+
"--host",
463+
"127.0.0.1",
464+
"--port",
465+
"9000",
466+
]);
467+
assert!(result.is_ok());
468+
}
395469
}

src/cli/serve.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,11 @@ use crate::backend::mock::MockEngine;
77
use crate::registry::model_registry::ModelRegistry;
88

99
/// Execute the serve command
10-
pub async fn execute(host: &str, port: u16) -> Result<(), Box<dyn std::error::Error>> {
10+
pub async fn execute(
11+
host: &str,
12+
port: u16,
13+
model_name: &str,
14+
) -> Result<(), Box<dyn std::error::Error>> {
1115
println!(
1216
"{}",
1317
"
@@ -23,7 +27,7 @@ pub async fn execute(host: &str, port: u16) -> Result<(), Box<dyn std::error::Er
2327
.bright_blue()
2428
.bold()
2529
);
26-
info!("Starting PUMA inference server");
30+
info!("Starting PUMA to serve model: {}", model_name);
2731

2832
// Initialize backend (MockEngine for now, replace with MLX later)
2933
let engine = Arc::new(MockEngine::new());

0 commit comments

Comments
 (0)