Skip to content

Commit 4a7c14e

Browse files
unamedkrclaude
andauthored
feat(cli): ollama-parity — pull / list / run / serve subcommands (#42)
Addresses Reddit feedback (smahs9, Eyelbee): the embedded niche is real, but developer experience was behind ollama's "one command to chat" standard. This adds that layer without touching any C code.

New subcommands in tools/tq:
- tq pull MODEL — download from HuggingFace by short alias (e.g., llama3.2:1b) to ~/.cache/quantcpp/
- tq list — show cached + available models with aliases (supports --json for scripting)
- tq run MODEL — auto-pull + interactive chat via ./build/quant
- tq serve MODEL — auto-pull + OpenAI API server via quant-server

Short aliases added:
- smollm2 / smollm2:135m → SmolLM2-135M
- qwen3.5 / qwen3.5:0.8b → Qwen3.5-0.8B
- llama3.2 / llama3.2:1b → Llama-3.2-1B

Implementation approach:
- Zero C changes — entirely a Python shim in tools/tq
- Reuses existing bindings/python quantcpp._MODEL_REGISTRY and quantcpp.download() (which already has progress bar + cache)
- Subprocess-invokes ./build/quant and ./build/quant-server
- Accepts local .gguf paths as well as registry aliases

Verified end-to-end:
- tq list (shows cached/remote status + aliases)
- tq pull llama3.2:1b (downloads 770 MB with progress bar)
- tq run smollm2:135m "prompt" (31 tok/s single-thread)
- tq serve smollm2:135m --port 8766 (OpenAI /health + /v1/models work)

Also: made numpy import optional (was top-level, broke non-bench commands if numpy wasn't installed).

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 64b0804 commit 4a7c14e

1 file changed

Lines changed: 263 additions & 8 deletions

File tree

tools/tq

Lines changed: 263 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,10 @@ import json
2626
import argparse
2727
import time
2828
import struct
29-
import numpy as np
29+
try:
30+
import numpy as np # optional — only used by bench/compare
31+
except ImportError:
32+
np = None
3033

3134
# Add bindings to path
3235
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../bindings/python"))
@@ -66,6 +69,225 @@ EXIT_LIB_MISSING = 2
6669
EXIT_MODEL_ERROR = 3
6770
EXIT_IO_ERROR = 4
6871

72+
# ═══════════════════════════════════════════════════════════
73+
# Ollama-style model registry (short alias → Python registry key)
74+
# ═══════════════════════════════════════════════════════════
75+
# User-friendly short names. Maps to quantcpp.* registry keys.
76+
# Each canonical registry key is listed once together with the short names a
# user may type for it; the flat alias -> key lookup table is derived from
# that grouping. Aliases are stored lower-case.
MODEL_ALIASES = {
    alias: canonical
    for canonical, aliases in (
        ("SmolLM2-135M", ("smollm2", "smollm2:135m")),
        ("Qwen3.5-0.8B", ("qwen3.5", "qwen3.5:0.8b")),
        ("Llama-3.2-1B", ("llama3.2", "llama3.2:1b")),
    )
    for alias in aliases
}
84+
85+
def resolve_model_name(name):
    """Translate what the user typed into the name the commands work with.

    Three kinds of input are understood:
      - a path to an existing file ending in ".gguf" (returned unchanged)
      - a short alias such as "llama3.2:1b" (matched case-insensitively
        against MODEL_ALIASES and replaced by the mapped registry key)
      - anything else, which is handed back untouched so the caller can
        try it as a canonical registry key such as "Llama-3.2-1B"

    Returns None when *name* is None.
    """
    if name is None:
        return None

    # An on-disk .gguf file wins over any alias of the same spelling.
    is_local_gguf = os.path.exists(name) and name.endswith(".gguf")
    if is_local_gguf:
        return name

    # Alias hit -> canonical key; miss -> the input exactly as given.
    return MODEL_ALIASES.get(name.lower(), name)
103+
104+
def _load_quantcpp():
    """Return the imported ``quantcpp`` module.

    If the import fails, the ImportError text and two install hints are
    written to stderr and the process exits with EXIT_LIB_MISSING.
    """
    try:
        import quantcpp
    except ImportError as exc:
        hints = (
            f"{C.RED}error:{C.NC} quantcpp bindings not importable: {exc}",
            f" install: {C.CYAN}pip install quantcpp{C.NC}",
            f" or dev: {C.CYAN}cd bindings/python && pip install -e .{C.NC}",
        )
        for line in hints:
            print(line, file=sys.stderr)
        sys.exit(EXIT_LIB_MISSING)
    return quantcpp
114+
115+
def _find_quant_binary():
    """Locate the ``quant`` executable.

    Search order:
      1. ``<project>/build/quant``
      2. ``<project>/build_metal/quant``
      3. ``quant`` resolved through PATH

    ``<project>`` is the parent of the directory containing this script.

    Returns:
        The path to the executable as a string, or None if none was found.
    """
    import shutil

    project = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    for build_dir in ("build", "build_metal"):
        candidate = os.path.join(project, build_dir, "quant")
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate

    # A bare command name must be resolved via PATH only. Testing it with
    # os.path.isfile() would look in the current working directory instead,
    # and the relative name returned from there would later be resolved
    # through PATH by os.execvp, i.e. to a different file or to nothing.
    return shutil.which("quant")
133+
134+
def _find_quant_server_binary():
    """Locate the ``quant-server`` executable.

    Search order:
      1. ``<project>/build/quant-server``
      2. ``<project>/build_metal/quant-server``
      3. ``quant-server`` resolved through PATH

    ``<project>`` is the parent of the directory containing this script.

    Returns:
        The path to the executable as a string, or None if none was found.
    """
    import shutil

    project = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    for build_dir in ("build", "build_metal"):
        candidate = os.path.join(project, build_dir, "quant-server")
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate

    # A bare command name must be resolved via PATH only. Testing it with
    # os.path.isfile() would look in the current working directory instead,
    # and the relative name returned from there would later be resolved
    # through PATH by os.execvp, i.e. to a different file or to nothing.
    return shutil.which("quant-server")
147+
148+
# ═══════════════════════════════════════════════════════════
149+
# Ollama-style commands: pull / list / run / serve
150+
# ═══════════════════════════════════════════════════════════
151+
152+
def cmd_pull(args):
    """Download a model by short alias or canonical name.

    Args:
        args: parsed CLI namespace; ``args.model`` is the user-supplied
            model name, alias, or local ``.gguf`` path.

    Returns:
        EXIT_OK when the model is a local file or was downloaded,
        EXIT_USAGE when the name is not in the registry,
        EXIT_IO_ERROR when the download (or sizing the result) fails.
    """
    quantcpp = _load_quantcpp()
    name = resolve_model_name(args.model)

    # A local .gguf file is already "pulled": nothing to download.
    if os.path.exists(name) and name.endswith(".gguf"):
        print(f"{C.GREEN}already local:{C.NC} {name}")
        return EXIT_OK

    registry = quantcpp._MODEL_REGISTRY
    if name not in registry:
        avail = ", ".join(sorted(registry.keys()))
        aliases = ", ".join(sorted(MODEL_ALIASES.keys()))
        print(f"{C.RED}unknown model:{C.NC} {args.model!r}", file=sys.stderr)
        print(f" registry: {avail}", file=sys.stderr)
        print(f" aliases: {aliases}", file=sys.stderr)
        return EXIT_USAGE

    print(f"{C.CYAN}pulling{C.NC} {name}...")
    try:
        path = quantcpp.download(name)
        size_mb = os.path.getsize(path) / (1024 * 1024)
    except Exception as e:
        # CLI boundary: report any download failure as an I/O error
        # instead of letting a traceback escape.
        print(f"{C.RED}download failed:{C.NC} {e}", file=sys.stderr)
        return EXIT_IO_ERROR

    # Keep the model name and the destination path visibly separated;
    # without the marker and the arrow they run together into one token.
    print(f"{C.GREEN}✓{C.NC} {name} → {path} ({size_mb:.0f} MB)")
    return EXIT_OK
179+
180+
def cmd_list(args):
    """List registry models and whether each one is present in the cache.

    Args:
        args: parsed CLI namespace; ``args.json_output`` selects JSON
            output instead of the human-readable table.

    Returns:
        EXIT_OK.

    JSON rows carry ``status`` ("cached" or "remote"), ``name``, ``alias``,
    ``size_mb`` and ``path``. ``path`` is the cached file's location, or
    null for models that have not been downloaded. ``size_mb`` is the
    on-disk size for cached models and the registry's approximate size
    otherwise.
    """
    quantcpp = _load_quantcpp()
    cache_dir = quantcpp._CACHE_DIR
    registry = quantcpp._MODEL_REGISTRY

    # First "family:size" style alias per canonical name, computed once
    # rather than rescanning MODEL_ALIASES for every registry row.
    tagged_alias = {}
    for alias, canonical in MODEL_ALIASES.items():
        if ":" in alias:
            tagged_alias.setdefault(canonical, alias)

    rows = []  # (status, name, alias, size_mb, path-or-None)
    for name, (_repo, filename, approx_mb) in sorted(registry.items()):
        path = cache_dir / filename
        if path.exists():
            status = "cached"
            size_mb = path.stat().st_size / (1024 * 1024)
            location = str(path)
        else:
            status = "remote"
            size_mb = approx_mb
            # No file on disk: report no path rather than a size string
            # masquerading as one.
            location = None
        rows.append((status, name, tagged_alias.get(name, ""), size_mb, location))

    if args.json_output:
        print(json.dumps([
            {"status": s, "name": n, "alias": a, "size_mb": round(sz, 1), "path": p}
            for (s, n, a, sz, p) in rows
        ], indent=2))
        return EXIT_OK

    print(f"\n {C.BOLD}Models{C.NC} cache: {cache_dir}\n")
    print(f" {C.BOLD}{'STATUS':<8} {'NAME':<16} {'ALIAS':<14} {'SIZE':>8}{C.NC}")
    print(f" {'─'*8} {'─'*16} {'─'*14} {'─'*8}")
    for status, name, alias, size_mb, _location in rows:
        color = C.GREEN if status == "cached" else C.DIM
        size_str = f"{size_mb:.0f} MB"
        print(f" {color}{status:<8}{C.NC} {name:<16} {C.DIM}{alias:<14}{C.NC} {size_str:>8}")
    print()
    return EXIT_OK
215+
216+
def cmd_run(args):
    """Chat with a model through the ``quant`` binary, pulling it if needed.

    Args:
        args: parsed CLI namespace with ``model`` (name, alias or local
            ``.gguf`` path), ``prompt`` (optional), ``threads`` and
            ``max_tokens``.

    Returns:
        EXIT_IO_ERROR if the download fails, EXIT_USAGE for an unknown
        model, EXIT_LIB_MISSING if the binary cannot be found. On success
        this function does not return: the process is replaced by the
        binary via ``os.execvp``.
    """
    import shlex

    quantcpp = _load_quantcpp()
    name = resolve_model_name(args.model)
    registry = quantcpp._MODEL_REGISTRY

    # Resolve to a path on disk (pull if needed).
    if os.path.exists(name) and name.endswith(".gguf"):
        model_path = name
    elif name in registry:
        _repo, filename, _approx_mb = registry[name]
        cached = quantcpp._CACHE_DIR / filename
        if cached.exists():
            model_path = str(cached)
        else:
            print(f"{C.CYAN}model not cached — pulling{C.NC} {name}")
            try:
                # download() may hand back a path-like object; argv entries
                # and the echoed command line below need plain strings.
                model_path = str(quantcpp.download(name))
            except Exception as e:
                print(f"{C.RED}pull failed:{C.NC} {e}", file=sys.stderr)
                return EXIT_IO_ERROR
    else:
        avail = ", ".join(sorted(registry.keys()))
        print(f"{C.RED}unknown model:{C.NC} {args.model!r}", file=sys.stderr)
        print(f" available: {avail}", file=sys.stderr)
        return EXIT_USAGE

    binary = _find_quant_binary()
    if not binary:
        print(f"{C.RED}quant binary not found:{C.NC} run `cmake --build build` first", file=sys.stderr)
        return EXIT_LIB_MISSING

    cmd = [binary, model_path, "--chat"]
    if args.prompt:
        cmd += ["-p", args.prompt]
    cmd += ["-j", str(args.threads)]
    cmd += ["-n", str(args.max_tokens)]

    # Quote each argument so the echoed line can be pasted into a shell
    # even when the prompt contains spaces or quotes.
    print(f"{C.DIM}{' '.join(shlex.quote(part) for part in cmd)}{C.NC}")

    # exec* replaces the process without flushing Python's buffered streams;
    # flush now or the lines printed above are lost when output is piped.
    sys.stdout.flush()
    sys.stderr.flush()
    os.execvp(cmd[0], cmd)
255+
256+
def cmd_serve(args):
    """Start the ``quant-server`` HTTP server for a model, pulling it if needed.

    Args:
        args: parsed CLI namespace with ``model`` (name, alias or local
            ``.gguf`` path), ``port`` and ``threads``.

    Returns:
        EXIT_IO_ERROR if the download fails, EXIT_USAGE for an unknown
        model, EXIT_LIB_MISSING if the server binary cannot be found. On
        success this function does not return: the process is replaced by
        the server via ``os.execvp``.
    """
    import shlex

    quantcpp = _load_quantcpp()
    name = resolve_model_name(args.model)
    registry = quantcpp._MODEL_REGISTRY

    # Resolve to a path on disk (pull if needed).
    if os.path.exists(name) and name.endswith(".gguf"):
        model_path = name
    elif name in registry:
        _repo, filename, _approx_mb = registry[name]
        cached = quantcpp._CACHE_DIR / filename
        if cached.exists():
            model_path = str(cached)
        else:
            print(f"{C.CYAN}model not cached — pulling{C.NC} {name}")
            try:
                # download() may hand back a path-like object; argv entries
                # and the echoed command line below need plain strings.
                model_path = str(quantcpp.download(name))
            except Exception as e:
                print(f"{C.RED}pull failed:{C.NC} {e}", file=sys.stderr)
                return EXIT_IO_ERROR
    else:
        # Tell the user which names would have worked.
        avail = ", ".join(sorted(registry.keys()))
        print(f"{C.RED}unknown model:{C.NC} {args.model!r}", file=sys.stderr)
        print(f" available: {avail}", file=sys.stderr)
        return EXIT_USAGE

    binary = _find_quant_server_binary()
    if not binary:
        print(f"{C.RED}quant-server binary not found:{C.NC} build with "
              f"`cmake -B build -DTQ_BUILD_SERVER=ON && cmake --build build`",
              file=sys.stderr)
        return EXIT_LIB_MISSING

    cmd = [binary, model_path, "-p", str(args.port), "-j", str(args.threads)]
    print(f"{C.GREEN}quant serve{C.NC} {name} on :{args.port}")
    # Quote each argument so the echoed line can be pasted into a shell.
    print(f"{C.DIM}{' '.join(shlex.quote(part) for part in cmd)}{C.NC}")

    # exec* replaces the process without flushing Python's buffered streams;
    # flush now or the lines printed above are lost when output is piped.
    sys.stdout.flush()
    sys.stderr.flush()
    os.execvp(cmd[0], cmd)
290+
69291
# ═══════════════════════════════════════════════════════════
70292
# COMMANDS
71293
# ═══════════════════════════════════════════════════════════
@@ -226,24 +448,49 @@ def main():
226448
formatter_class=argparse.RawDescriptionHelpFormatter,
227449
epilog="""
228450
commands:
451+
pull MODEL Download a model (e.g., llama3.2:1b)
452+
list List cached and available models
453+
run MODEL [PROMPT] Chat with a model (auto-pulls if needed)
454+
serve MODEL Start OpenAI-compatible HTTP server
229455
info Show quantization types and recommendations
230456
bench Run performance benchmark
231457
+memory MODEL CTX Calculate memory savings
232458
+compare Run A/B comparison (requires build)
233-
demo Chat with Qwen3.5-0.8B (native C engine)
459+
demo Chat with Qwen3.5-0.8B (legacy, use `run` instead)
234460
235461
examples:
236-
tq info
462+
tq pull llama3.2:1b
463+
tq list
464+
tq run llama3.2:1b
465+
tq run llama3.2:1b "What is gravity?"
466+
tq serve llama3.2:1b --port 8080
237467
tq info --json
238468
tq bench --seq-len 2048 --head-dim 256
239-
tq +memory llama-3.2-3b 65536
240-
tq +memory qwen3.5-0.8b 131072 --json
241-
tq demo "What is quantization?"
242-
tq demo --engine pytorch "What is quantization?"
243469
""")
244470
parser.add_argument("--json", dest="json_output", action="store_true", help="JSON output (for AI agents)")
245471
sub = parser.add_subparsers(dest="command")
246472

473+
# pull
474+
p_pull = sub.add_parser("pull", help="Download a model from HuggingFace")
475+
p_pull.add_argument("model", help="Model name or alias (e.g., llama3.2:1b)")
476+
477+
# list
478+
p_list = sub.add_parser("list", help="List cached and available models")
479+
p_list.add_argument("--json", dest="json_output", action="store_true")
480+
481+
# run
482+
p_run = sub.add_parser("run", help="Chat with a model (auto-pulls if needed)")
483+
p_run.add_argument("model", help="Model name or alias")
484+
p_run.add_argument("prompt", nargs="?", default=None, help="Optional prompt (interactive if omitted)")
485+
p_run.add_argument("-j", "--threads", type=int, default=4)
486+
p_run.add_argument("-n", "--max-tokens", type=int, default=256)
487+
488+
# serve
489+
p_serve = sub.add_parser("serve", help="Start OpenAI-compatible HTTP server")
490+
p_serve.add_argument("model", help="Model name or alias")
491+
p_serve.add_argument("-p", "--port", type=int, default=8080)
492+
p_serve.add_argument("-j", "--threads", type=int, default=4)
493+
247494
# info
248495
p_info = sub.add_parser("info", help="Quantization type information")
249496
p_info.add_argument("--json", dest="json_output", action="store_true")
@@ -275,7 +522,15 @@ examples:
275522
parser.print_help()
276523
return EXIT_USAGE
277524

278-
if args.command == "info":
525+
if args.command == "pull":
526+
return cmd_pull(args)
527+
elif args.command == "list":
528+
return cmd_list(args)
529+
elif args.command == "run":
530+
return cmd_run(args)
531+
elif args.command == "serve":
532+
return cmd_serve(args)
533+
elif args.command == "info":
279534
return cmd_info(args)
280535
elif args.command == "bench":
281536
return cmd_bench(args)

0 commit comments

Comments
 (0)