@@ -26,7 +26,10 @@ import json
2626import argparse
2727import time
2828import struct
29- import numpy as np
29+ try :
30+ import numpy as np # optional — only used by bench/compare
31+ except ImportError :
32+ np = None
3033
3134# Add bindings to path
3235sys .path .insert (0 , os .path .join (os .path .dirname (os .path .abspath (__file__ )), "../bindings/python" ))
@@ -66,6 +69,225 @@ EXIT_LIB_MISSING = 2
6669EXIT_MODEL_ERROR = 3
6770EXIT_IO_ERROR = 4
6871
72+ # ═══════════════════════════════════════════════════════════
73+ # Ollama-style model registry (short alias → Python registry key)
74+ # ═══════════════════════════════════════════════════════════
75+ # User-friendly short names. Maps to quantcpp.* registry keys.
MODEL_ALIASES = {
    "smollm2": "SmolLM2-135M",
    "smollm2:135m": "SmolLM2-135M",
    "qwen3.5": "Qwen3.5-0.8B",
    "qwen3.5:0.8b": "Qwen3.5-0.8B",
    "llama3.2": "Llama-3.2-1B",
    "llama3.2:1b": "Llama-3.2-1B",
}


def resolve_model_name(name):
    """Resolve user input to a canonical registry key or a local path.

    Accepted forms, checked in this order:
      1. an existing path ending in ``.gguf`` (returned unchanged)
      2. a short alias from ``MODEL_ALIASES``, matched case-insensitively
         (``llama3.2:1b`` -> ``Llama-3.2-1B``)
      3. a canonical key that appears as a value in ``MODEL_ALIASES``,
         matched case-insensitively (``llama-3.2-1b`` -> ``Llama-3.2-1B``)

    Anything else is returned as-is so the caller can still try it against
    the registry and report it as unknown.

    Args:
        name: The model argument as typed by the user, or ``None``.

    Returns:
        ``None`` when ``name`` is ``None``; otherwise the resolved string.
    """
    if name is None:
        return None

    # Local file path takes precedence over any alias of the same spelling.
    if os.path.exists(name) and name.endswith(".gguf"):
        return name

    lowered = name.lower()

    alias_target = MODEL_ALIASES.get(lowered)
    if alias_target is not None:
        return alias_target

    # Aliases are case-insensitive, so canonical keys should be too;
    # otherwise "llama-3.2-1b" is rejected while "LLAMA3.2:1B" is accepted.
    for canonical in MODEL_ALIASES.values():
        if canonical.lower() == lowered:
            return canonical

    return name  # try as-is (might match a registry key with no alias)
103+
def _load_quantcpp():
    """Return the imported ``quantcpp`` module.

    The import is done lazily here rather than at module import time. If
    the import fails, install hints are written to stderr and the process
    exits with ``EXIT_LIB_MISSING``.
    """
    try:
        import quantcpp
    except ImportError as exc:
        hints = (
            f"{C.RED} error:{C.NC} quantcpp bindings not importable: {exc} ",
            f" install: {C.CYAN} pip install quantcpp{C.NC} ",
            f" or dev: {C.CYAN} cd bindings/python && pip install -e .{C.NC} ",
        )
        for line in hints:
            print(line, file=sys.stderr)
        sys.exit(EXIT_LIB_MISSING)
    else:
        return quantcpp
114+
def _locate_binary(binary_name):
    """Find an executable called ``binary_name`` for this project.

    Search order:
      1. ``<project>/build/<binary_name>``
      2. ``<project>/build_metal/<binary_name>``
      3. the first match on ``PATH`` (via ``shutil.which``)

    ``<project>`` is the parent of the directory containing this script.

    A bare relative name is deliberately NOT probed with ``os.path.isfile``:
    that would test the current working directory, while the caller hands
    the result to ``os.execvp``, which resolves bare names through ``PATH``.
    The two could disagree and select a binary that cannot be executed.

    Returns:
        A path string, or ``None`` when nothing executable was found.
    """
    import shutil  # local import: not guaranteed to be imported at file top

    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)

    for build_dir in ("build", "build_metal"):
        candidate = os.path.join(project_root, build_dir, binary_name)
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate

    return shutil.which(binary_name)


def _find_quant_binary():
    """Locate the ``quant`` binary (project build dirs first, then PATH)."""
    return _locate_binary("quant")


def _find_quant_server_binary():
    """Locate the ``quant-server`` binary (project build dirs first, then PATH)."""
    return _locate_binary("quant-server")
147+
148+ # ═══════════════════════════════════════════════════════════
149+ # Ollama-style commands: pull / list / run / serve
150+ # ═══════════════════════════════════════════════════════════
151+
def cmd_pull(args):
    """Download a model given a short alias or canonical registry name.

    Args:
        args: Parsed CLI namespace; only ``args.model`` is read.

    Returns:
        ``EXIT_OK`` when the model is already a local ``.gguf`` file or the
        download succeeds, ``EXIT_USAGE`` when the name is not in the
        registry, ``EXIT_IO_ERROR`` when the download (or the size lookup
        that follows it) raises.
    """
    quantcpp = _load_quantcpp()
    name = resolve_model_name(args.model)

    # A local .gguf file is already "pulled"; nothing to download.
    if os.path.exists(name) and name.endswith(".gguf"):
        print(f"{C.GREEN} already local:{C.NC} {name} ")
        return EXIT_OK

    registry = quantcpp._MODEL_REGISTRY
    if name not in registry:
        known_models = ", ".join(sorted(registry.keys()))
        known_aliases = ", ".join(sorted(MODEL_ALIASES.keys()))
        for line in (
            f"{C.RED} unknown model:{C.NC} {args.model!r} ",
            f" registry: {known_models} ",
            f" aliases: {known_aliases} ",
        ):
            print(line, file=sys.stderr)
        return EXIT_USAGE

    print(f"{C.CYAN} pulling{C.NC} {name} ...")
    try:
        path = quantcpp.download(name)
        megabytes = os.path.getsize(path) / (1024 * 1024)
        print(f"{C.GREEN} ✓{C.NC} {name} → {path} ({megabytes:.0f} MB)")
    except Exception as exc:
        print(f"{C.RED} download failed:{C.NC} {exc} ", file=sys.stderr)
        return EXIT_IO_ERROR
    return EXIT_OK
179+
def cmd_list(args):
    """List every registry model and whether it is present in the cache.

    Each registry entry is unpacked as ``(repo, filename, approx_mb)``. A
    model counts as "cached" when ``<cache_dir>/<filename>`` exists, in
    which case its real on-disk size is shown; otherwise it is "remote"
    and the registry's approximate size is shown.

    Args:
        args: Parsed CLI namespace; ``args.json_output`` selects JSON
            output instead of the human-readable table.

    Returns:
        ``EXIT_OK``.
    """
    quantcpp = _load_quantcpp()
    cache_dir = quantcpp._CACHE_DIR  # presumably a pathlib.Path (uses "/" and .exists())

    def describe(model_name, filename, approx_mb):
        """Build one (status, name, alias, size_mb, path) row."""
        # Prefer the "family:size" spelling of the alias for display.
        display_alias = next(
            (alias for alias, target in MODEL_ALIASES.items()
             if target == model_name and ":" in alias),
            "",
        )
        local_file = cache_dir / filename
        if local_file.exists():
            on_disk_mb = local_file.stat().st_size / (1024 * 1024)
            return ("cached", model_name, display_alias, on_disk_mb, str(local_file))
        return ("remote", model_name, display_alias, approx_mb, f"~{approx_mb} MB")

    rows = [
        describe(model_name, filename, approx_mb)
        for model_name, (_repo, filename, approx_mb)
        in sorted(quantcpp._MODEL_REGISTRY.items())
    ]

    if args.json_output:
        payload = []
        for status, model_name, alias, size_mb, location in rows:
            payload.append({
                "status": status,
                "name": model_name,
                "alias": alias,
                "size_mb": round(size_mb, 1),
                "path": location,
            })
        print(json.dumps(payload, indent=2))
        return EXIT_OK

    print(f"\n {C.BOLD} Models{C.NC} cache: {cache_dir} \n ")
    print(f" {C.BOLD} {'STATUS':<8} {'NAME':<16} {'ALIAS':<14} {'SIZE':>8} {C.NC} ")
    print(f" {'─' * 8} {'─' * 16} {'─' * 14} {'─' * 8} ")
    for status, model_name, alias, size_mb, _location in rows:
        if status == "cached":
            color = C.GREEN
        else:
            color = C.DIM
        size_str = f"{size_mb:.0f} MB"
        print(f" {color} {status:<8} {C.NC} {model_name:<16} {C.DIM} {alias:<14} {C.NC} {size_str:>8} ")
    print()
    return EXIT_OK
215+
def cmd_run(args):
    """Chat with a model through the ``quant`` binary, pulling it if needed.

    The model argument is resolved to a local ``.gguf`` path (downloading
    it when it is a registry model that is not cached yet). The current
    process is then replaced by ``quant <model> --chat ...`` via
    ``os.execvp``, so on success this function never returns.

    Args:
        args: Parsed CLI namespace. Reads ``model``, ``prompt``,
            ``threads`` and ``max_tokens``.

    Returns:
        ``EXIT_USAGE`` for an unknown model, ``EXIT_IO_ERROR`` when the
        download or the exec fails, ``EXIT_LIB_MISSING`` when the ``quant``
        binary cannot be found.
    """
    quantcpp = _load_quantcpp()
    name = resolve_model_name(args.model)

    # Resolve to a local path (pull if needed).
    if os.path.exists(name) and name.endswith(".gguf"):
        model_path = name
    elif name in quantcpp._MODEL_REGISTRY:
        _repo, filename, _approx_mb = quantcpp._MODEL_REGISTRY[name]
        cached = quantcpp._CACHE_DIR / filename
        if cached.exists():
            model_path = str(cached)
        else:
            print(f"{C.CYAN} model not cached — pulling{C.NC} {name} ")
            try:
                # str(): argv is echoed below with str.join, which rejects
                # path-like objects; the cached branch already yields str.
                model_path = str(quantcpp.download(name))
            except Exception as e:
                print(f"{C.RED} pull failed:{C.NC} {e} ", file=sys.stderr)
                return EXIT_IO_ERROR
    else:
        avail = ", ".join(sorted(quantcpp._MODEL_REGISTRY.keys()))
        print(f"{C.RED} unknown model:{C.NC} {args.model!r} ", file=sys.stderr)
        print(f" available: {avail} ", file=sys.stderr)
        return EXIT_USAGE

    binary = _find_quant_binary()
    if not binary:
        print(f"{C.RED} quant binary not found:{C.NC} run `cmake --build build` first", file=sys.stderr)
        return EXIT_LIB_MISSING

    cmd = [binary, model_path, "--chat"]
    if args.prompt:
        cmd += ["-p", args.prompt]
    cmd += ["-j", str(args.threads)]
    cmd += ["-n", str(args.max_tokens)]

    print(f"{C.DIM} → {' '.join(cmd)} {C.NC} ")

    # exec replaces the process image without flushing Python's buffered
    # streams; flush first or the lines printed above can be lost when
    # stdout is a pipe or file.
    sys.stdout.flush()
    sys.stderr.flush()
    try:
        os.execvp(cmd[0], cmd)
    except OSError as e:
        # Only reached when the exec itself failed (e.g. binary removed or
        # not executable after it was located).
        print(f"{C.RED} exec failed:{C.NC} {binary}: {e}", file=sys.stderr)
        return EXIT_IO_ERROR
255+
def cmd_serve(args):
    """Start the ``quant-server`` HTTP server for a model, pulling it if needed.

    The model argument is resolved to a local ``.gguf`` path (downloading
    it when it is a registry model that is not cached yet). The current
    process is then replaced by ``quant-server <model> -p PORT -j THREADS``
    via ``os.execvp``, so on success this function never returns.

    Args:
        args: Parsed CLI namespace. Reads ``model``, ``port`` and
            ``threads``.

    Returns:
        ``EXIT_USAGE`` for an unknown model, ``EXIT_IO_ERROR`` when the
        download or the exec fails, ``EXIT_LIB_MISSING`` when the
        ``quant-server`` binary cannot be found.
    """
    quantcpp = _load_quantcpp()
    name = resolve_model_name(args.model)

    if os.path.exists(name) and name.endswith(".gguf"):
        model_path = name
    elif name in quantcpp._MODEL_REGISTRY:
        _repo, filename, _approx_mb = quantcpp._MODEL_REGISTRY[name]
        cached = quantcpp._CACHE_DIR / filename
        if cached.exists():
            model_path = str(cached)
        else:
            print(f"{C.CYAN} model not cached — pulling{C.NC} {name} ")
            try:
                # str(): argv is echoed below with str.join, which rejects
                # path-like objects; the cached branch already yields str.
                model_path = str(quantcpp.download(name))
            except Exception as e:
                print(f"{C.RED} pull failed:{C.NC} {e} ", file=sys.stderr)
                return EXIT_IO_ERROR
    else:
        # List the valid names, as the `run` command does, so the user can
        # correct a typo without running `list` separately.
        avail = ", ".join(sorted(quantcpp._MODEL_REGISTRY.keys()))
        print(f"{C.RED} unknown model:{C.NC} {args.model!r} ", file=sys.stderr)
        print(f" available: {avail} ", file=sys.stderr)
        return EXIT_USAGE

    binary = _find_quant_server_binary()
    if not binary:
        print(f"{C.RED} quant-server binary not found:{C.NC} build with "
              f"`cmake -B build -DTQ_BUILD_SERVER=ON && cmake --build build`",
              file=sys.stderr)
        return EXIT_LIB_MISSING

    cmd = [binary, model_path, "-p", str(args.port), "-j", str(args.threads)]
    print(f"{C.GREEN} quant serve{C.NC} {name} on :{args.port} ")
    print(f"{C.DIM} → {' '.join(cmd)} {C.NC} ")

    # exec replaces the process image without flushing Python's buffered
    # streams; flush first or the banner above can be lost when stdout is
    # a pipe or file (typical for a server started under a supervisor).
    sys.stdout.flush()
    sys.stderr.flush()
    try:
        os.execvp(cmd[0], cmd)
    except OSError as e:
        # Only reached when the exec itself failed.
        print(f"{C.RED} exec failed:{C.NC} {binary}: {e}", file=sys.stderr)
        return EXIT_IO_ERROR
290+
69291# ═══════════════════════════════════════════════════════════
70292# COMMANDS
71293# ═══════════════════════════════════════════════════════════
@@ -226,24 +448,49 @@ def main():
226448 formatter_class = argparse .RawDescriptionHelpFormatter ,
227449 epilog = """
228450commands:
451+ pull MODEL Download a model (e.g., llama3.2:1b)
452+ list List cached and available models
453+ run MODEL [PROMPT] Chat with a model (auto-pulls if needed)
454+ serve MODEL Start OpenAI-compatible HTTP server
229455 info Show quantization types and recommendations
230456 bench Run performance benchmark
231457 +memory MODEL CTX Calculate memory savings
232458 +compare Run A/B comparison (requires build)
233- demo Chat with Qwen3.5-0.8B (native C engine )
459+ demo Chat with Qwen3.5-0.8B (legacy, use `run` instead )
234460
235461examples:
236- tq info
462+ tq pull llama3.2:1b
463+ tq list
464+ tq run llama3.2:1b
465+ tq run llama3.2:1b "What is gravity?"
466+ tq serve llama3.2:1b --port 8080
237467 tq info --json
238468 tq bench --seq-len 2048 --head-dim 256
239- tq +memory llama-3.2-3b 65536
240- tq +memory qwen3.5-0.8b 131072 --json
241- tq demo "What is quantization?"
242- tq demo --engine pytorch "What is quantization?"
243469""" )
244470 parser .add_argument ("--json" , dest = "json_output" , action = "store_true" , help = "JSON output (for AI agents)" )
245471 sub = parser .add_subparsers (dest = "command" )
246472
473+ # pull
474+ p_pull = sub .add_parser ("pull" , help = "Download a model from HuggingFace" )
475+ p_pull .add_argument ("model" , help = "Model name or alias (e.g., llama3.2:1b)" )
476+
477+ # list
478+ p_list = sub .add_parser ("list" , help = "List cached and available models" )
479+ p_list .add_argument ("--json" , dest = "json_output" , action = "store_true" )
480+
481+ # run
482+ p_run = sub .add_parser ("run" , help = "Chat with a model (auto-pulls if needed)" )
483+ p_run .add_argument ("model" , help = "Model name or alias" )
484+ p_run .add_argument ("prompt" , nargs = "?" , default = None , help = "Optional prompt (interactive if omitted)" )
485+ p_run .add_argument ("-j" , "--threads" , type = int , default = 4 )
486+ p_run .add_argument ("-n" , "--max-tokens" , type = int , default = 256 )
487+
488+ # serve
489+ p_serve = sub .add_parser ("serve" , help = "Start OpenAI-compatible HTTP server" )
490+ p_serve .add_argument ("model" , help = "Model name or alias" )
491+ p_serve .add_argument ("-p" , "--port" , type = int , default = 8080 )
492+ p_serve .add_argument ("-j" , "--threads" , type = int , default = 4 )
493+
247494 # info
248495 p_info = sub .add_parser ("info" , help = "Quantization type information" )
249496 p_info .add_argument ("--json" , dest = "json_output" , action = "store_true" )
@@ -275,7 +522,15 @@ examples:
275522 parser .print_help ()
276523 return EXIT_USAGE
277524
278- if args .command == "info" :
525+ if args .command == "pull" :
526+ return cmd_pull (args )
527+ elif args .command == "list" :
528+ return cmd_list (args )
529+ elif args .command == "run" :
530+ return cmd_run (args )
531+ elif args .command == "serve" :
532+ return cmd_serve (args )
533+ elif args .command == "info" :
279534 return cmd_info (args )
280535 elif args .command == "bench" :
281536 return cmd_bench (args )
0 commit comments