Skip to content

Commit 79db43a

Browse files
unamedkr and claude authored
feat(pypi): quantcpp CLI ollama-parity (pull/list/run/serve) (#43)
PR #42 added these commands to tools/tq, but that is only available to people who clone the repo. For PyPI users running `pip install quantcpp`, the entry point is `quantcpp = quantcpp.cli:main`, which only had a single chat shortcut. This adds proper ollama-style subcommands to bindings/python/quantcpp/cli.py so PyPI users get the same DX: `pip install quantcpp`; `quantcpp pull llama3.2:1b`; `quantcpp list`; `quantcpp run llama3.2:1b`; `quantcpp serve llama3.2:1b --port 8080`. Short aliases (smollm2:135m, qwen3.5:0.8b, llama3.2:1b) match the tools/tq registry. Backwards-compatible: bare `quantcpp` and `quantcpp "question"` still work. Bumped version 0.11.0 → 0.12.0. Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 4a7c14e commit 79db43a

2 files changed

Lines changed: 257 additions & 32 deletions

File tree

bindings/python/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
77

88
[project]
99
name = "quantcpp"
10-
version = "0.11.0"
10+
version = "0.12.0"
1111
description = "Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)"
1212
readme = "README.md"
1313
license = { text = "Apache-2.0" }

bindings/python/quantcpp/cli.py

Lines changed: 256 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,50 +1,157 @@
11
"""
22
quantcpp CLI — chat with a local LLM in your terminal.
33
4-
Usage:
5-
quantcpp # auto-downloads Llama-3.2-1B, starts chat
6-
quantcpp "What is gravity?" # one-shot question
7-
quantcpp --model SmolLM2-135M # use a smaller model (faster download)
8-
quantcpp --model path/to/file.gguf # use your own GGUF file
4+
Ollama-style commands:
5+
quantcpp pull MODEL Download a model from HuggingFace
6+
quantcpp list List cached and available models
7+
quantcpp run MODEL [Q] Chat with a model (auto-pulls if needed)
8+
quantcpp serve MODEL Start OpenAI-compatible HTTP server
9+
10+
Backwards-compatible shortcut:
11+
quantcpp Auto-downloads Llama-3.2-1B, starts chat
12+
quantcpp "What is X?" One-shot question with default model
13+
quantcpp --model NAME Use a specific model
914
"""
1015

1116
import sys
1217
import os
18+
import json
1319

1420

15-
def main():
16-
import argparse
17-
parser = argparse.ArgumentParser(
18-
prog="quantcpp",
19-
description="Chat with a local LLM. No API key, no GPU, no server.",
20-
)
21-
parser.add_argument("prompt", nargs="*", help="Question to ask (omit for interactive chat)")
22-
parser.add_argument("--model", "-m", default="Llama-3.2-1B",
23-
help="Model name or path to .gguf file (default: Llama-3.2-1B)")
24-
parser.add_argument("--max-tokens", "-n", type=int, default=256)
25-
parser.add_argument("--temperature", "-t", type=float, default=0.7)
26-
args = parser.parse_args()
21+
# Ollama-style short aliases mapped to canonical _MODEL_REGISTRY keys.
# Every model family is reachable by its bare family name and by an explicit
# "family:size" tag; lookups lowercase the user input before consulting this
# table, so all keys here are lowercase.
MODEL_ALIASES = {
    # SmolLM2
    "smollm2": "SmolLM2-135M",
    "smollm2:135m": "SmolLM2-135M",
    # Qwen 3.5
    "qwen3.5": "Qwen3.5-0.8B",
    "qwen3.5:0.8b": "Qwen3.5-0.8B",
    # Llama 3.2
    "llama3.2": "Llama-3.2-1B",
    "llama3.2:1b": "Llama-3.2-1B",
}
2730

28-
from quantcpp import Model
2931

30-
# Load model
31-
model_path = args.model
32-
if os.path.isfile(model_path):
33-
print(f"Loading {model_path}...", file=sys.stderr)
34-
m = Model(model_path, max_tokens=args.max_tokens, temperature=args.temperature)
35-
else:
36-
print(f"Downloading {model_path}...", file=sys.stderr)
37-
m = Model.from_pretrained(model_path, max_tokens=args.max_tokens,
38-
temperature=args.temperature)
32+
def _resolve_name(name):
    """Map user input to a canonical registry key or a local ``.gguf`` path.

    ``None`` is passed through unchanged. An existing path whose name ends in
    ``.gguf`` is returned as given. Any other input is lowercased and looked
    up in ``MODEL_ALIASES``; input that is not an alias is returned untouched.
    """
    if name is None:
        return None

    points_at_gguf = name.endswith(".gguf") and os.path.exists(name)
    if points_at_gguf:
        return name

    try:
        return MODEL_ALIASES[name.lower()]
    except KeyError:
        # Not an alias: hand the input back for the caller to interpret.
        return name
39+
40+
41+
def _registry():
    """Return the ``(_MODEL_REGISTRY, _CACHE_DIR)`` pair from ``quantcpp``.

    The import is performed inside the function rather than at module level.
    """
    from quantcpp import _MODEL_REGISTRY as model_registry
    from quantcpp import _CACHE_DIR as cache_dir

    return model_registry, cache_dir
44+
45+
46+
def cmd_pull(args):
    """Download a model by alias or canonical name.

    Args:
        args: Parsed CLI namespace; only ``args.model`` is read.

    Returns:
        0 when the argument is already a local ``.gguf`` file or the download
        succeeded; 1 when the name is not in the registry or the download
        failed.
    """
    import quantcpp

    name = _resolve_name(args.model)

    # isfile, not exists: a directory that happens to be named "x.gguf" is not
    # a model file and must not be reported as already present.
    if os.path.isfile(name) and name.endswith(".gguf"):
        print(f"already local: {name}")
        return 0

    registry = quantcpp._MODEL_REGISTRY
    if name not in registry:
        avail = ", ".join(sorted(registry.keys()))
        aliases = ", ".join(sorted(MODEL_ALIASES.keys()))
        print(f"unknown model: {args.model!r}", file=sys.stderr)
        print(f" registry: {avail}", file=sys.stderr)
        print(f" aliases: {aliases}", file=sys.stderr)
        return 1

    print(f"pulling {name}...", file=sys.stderr)
    try:
        # Only the calls that can actually fail live inside the try body.
        path = quantcpp.download(name)
        size_mb = os.path.getsize(path) / (1024 * 1024)
    except Exception as e:
        # Command-line boundary: any failure becomes a message and exit code 1.
        print(f"download failed: {e}", file=sys.stderr)
        return 1

    print(f"\u2713 {name} \u2192 {path} ({size_mb:.0f} MB)", file=sys.stderr)
    return 0
72+
73+
74+
def cmd_list(args):
    """Print every registry model with its cache status, alias and size.

    When ``args.json_output`` is set the rows are written as a JSON array;
    otherwise a fixed-width table is printed. Always returns 0.
    """
    registry, cache_dir = _registry()
    bytes_per_mb = 1024 * 1024

    def tagged_alias(canonical):
        # First "family:size" style alias that points at this registry key.
        for alias, target in MODEL_ALIASES.items():
            if target == canonical and ":" in alias:
                return alias
        return ""

    rows = []
    for name in sorted(registry):
        _repo, filename, approx_mb = registry[name]
        local_file = cache_dir / filename
        if local_file.exists():
            # Cached models report their real on-disk size and location.
            entry = (
                "cached",
                name,
                tagged_alias(name),
                local_file.stat().st_size / bytes_per_mb,
                str(local_file),
            )
        else:
            # Remote models fall back to the registry's approximate size.
            entry = (
                "remote",
                name,
                tagged_alias(name),
                approx_mb,
                f"~{approx_mb} MB",
            )
        rows.append(entry)

    if args.json_output:
        payload = []
        for status, name, alias, size_mb, shown_path in rows:
            payload.append({
                "status": status,
                "name": name,
                "alias": alias,
                "size_mb": round(size_mb, 1),
                "path": shown_path,
            })
        print(json.dumps(payload, indent=2))
        return 0

    print(f"\n Models cache: {cache_dir}\n")
    print(f" {'STATUS':<8} {'NAME':<16} {'ALIAS':<14} {'SIZE':>8}")
    print(f" {'-'*8} {'-'*16} {'-'*14} {'-'*8}")
    for status, name, alias, size_mb, _shown_path in rows:
        size_str = f"{size_mb:.0f} MB"
        print(f" {status:<8} {name:<16} {alias:<14} {size_str:>8}")
    print()
    return 0
107+
108+
109+
def _resolve_to_path(name_or_path):
    """Resolve an alias, registry name or path to a local ``.gguf`` file.

    Args:
        name_or_path: Alias, canonical registry key, or path to a ``.gguf``
            file.

    Returns:
        The path of the model file: the input itself when it is an existing
        ``.gguf`` file, the cached copy when one exists, otherwise whatever
        ``quantcpp.download`` returns after fetching the model.

    Raises:
        ValueError: If no model was given or the name is not in the registry.
    """
    import quantcpp

    name = _resolve_name(name_or_path)
    if not name:
        # A missing name would otherwise surface as a TypeError from os.path.
        raise ValueError("no model specified")

    # isfile, not exists: a directory named "x.gguf" must not be handed to the
    # model loader as if it were a model file.
    if os.path.isfile(name) and name.endswith(".gguf"):
        return name

    registry = quantcpp._MODEL_REGISTRY
    if name not in registry:
        avail = ", ".join(sorted(registry.keys()))
        raise ValueError(
            f"unknown model: {name_or_path!r}. Available: {avail}"
        )

    _repo, filename, _approx_mb = registry[name]
    cached = quantcpp._CACHE_DIR / filename
    if cached.exists():
        return str(cached)

    print(f"model not cached \u2014 pulling {name}...", file=sys.stderr)
    return quantcpp.download(name)
130+
131+
132+
def cmd_run(args):
133+
"""Chat with a model (auto-pull if needed)."""
134+
try:
135+
model_path = _resolve_to_path(args.model)
136+
except ValueError as e:
137+
print(str(e), file=sys.stderr)
138+
return 1
139+
except Exception as e:
140+
print(f"pull failed: {e}", file=sys.stderr)
141+
return 1
142+
143+
from quantcpp import Model
144+
print(f"loading {os.path.basename(model_path)}...", file=sys.stderr)
145+
m = Model(model_path, max_tokens=args.max_tokens, temperature=args.temperature,
146+
n_threads=args.threads)
39147

40-
# One-shot or interactive
41148
if args.prompt:
42-
question = " ".join(args.prompt)
149+
question = " ".join(args.prompt) if isinstance(args.prompt, list) else args.prompt
43150
for tok in m.generate(question):
44151
print(tok, end="", flush=True)
45152
print()
46153
else:
47-
print("quantcpp type your message, Ctrl+C to exit", file=sys.stderr)
154+
print("quantcpp \u2014 type your message, Ctrl+C to exit", file=sys.stderr)
48155
try:
49156
while True:
50157
question = input("\nYou: ")
@@ -58,7 +165,125 @@ def main():
58165
print("\nBye!", file=sys.stderr)
59166

60167
m.close()
168+
return 0
169+
170+
171+
def cmd_serve(args):
    """Start the OpenAI-compatible HTTP server by exec'ing ``quant-server``.

    Args:
        args: Parsed CLI namespace; reads ``args.model``, ``args.port`` and
            ``args.threads``.

    Returns:
        1 if the model cannot be resolved or the binary cannot be launched,
        2 if no ``quant-server`` binary is found. On success this function
        does not return: the current process is replaced by the server.
    """
    import shutil

    try:
        model_path = _resolve_to_path(args.model)
    except Exception as e:
        # Command-line boundary: unknown names and download failures alike
        # become a message and exit code 1.
        print(f"error: {e}", file=sys.stderr)
        return 1

    binary = shutil.which("quant-server")
    if not binary:
        # Look in common build dirs relative to the current directory.
        for guess in ("./build/quant-server", "./build_metal/quant-server"):
            if os.path.isfile(guess) and os.access(guess, os.X_OK):
                binary = guess
                break

    if not binary:
        print("quant-server binary not found.", file=sys.stderr)
        print(" Build with: cmake -B build -DTQ_BUILD_SERVER=ON && cmake --build build",
              file=sys.stderr)
        print(" Or install via your package manager.", file=sys.stderr)
        return 2

    cmd = [binary, model_path, "-p", str(args.port), "-j", str(args.threads)]
    print(f"quant serve {os.path.basename(model_path)} on :{args.port}", file=sys.stderr)

    # exec replaces the process without flushing Python-level buffers, so
    # anything still buffered would be lost unless flushed here.
    sys.stdout.flush()
    sys.stderr.flush()
    try:
        os.execvp(cmd[0], cmd)
    except OSError as e:
        # exec only returns control on failure (e.g. permissions, bad format).
        print(f"failed to launch {binary}: {e}", file=sys.stderr)
        return 1
200+
201+
202+
def cmd_chat_default(args):
    """Run the no-subcommand shortcut: chat using the default model.

    Fills in any attribute ``cmd_run`` needs that is absent or empty on
    ``args`` (model, threads, max_tokens, temperature, prompt) and then
    delegates to ``cmd_run``, returning its result.
    """
    if not args.model:
        args.model = "Llama-3.2-1B"

    fallbacks = (("threads", 4), ("max_tokens", 256), ("temperature", 0.7))
    for attr, fallback in fallbacks:
        # Keep a value that is already present; only supply missing ones.
        setattr(args, attr, getattr(args, attr, fallback))

    if not args.prompt:
        # An empty prompt list means "interactive"; normalise it to None.
        args.prompt = None

    return cmd_run(args)
210+
211+
212+
def _first_positional(argv):
    """Return the first non-option token in ``argv``, or None if there is none.

    Values belonging to the top-level value-taking options are skipped so that
    ``-j 8 run MODEL`` still yields ``"run"``. A literal ``--`` ends the scan.
    """
    short_opts = ("-m", "-n", "-t", "-j")
    long_opts = ("--model", "--max-tokens", "--temperature", "--threads")

    skip_value = False
    for token in argv:
        if skip_value:
            skip_value = False
            continue
        if token == "--":
            return None
        if token.startswith("-") and len(token) > 1:
            if token in short_opts:
                skip_value = True
            elif token.startswith("--") and "=" not in token:
                # argparse also accepts prefixes of long option names.
                skip_value = any(opt.startswith(token) for opt in long_opts)
            continue
        return token
    return None


def main(argv=None):
    """Entry point for the ``quantcpp`` console script.

    The first positional token decides the mode. If it names a subcommand
    (``pull``, ``list``, ``run``, ``serve``) the subcommand parser is used.
    Otherwise every positional is treated as the prompt of the
    backwards-compatible default chat. The two sets of positionals are never
    registered on the same parser: a subparsers action would reject a free-text
    prompt as an invalid command, and a top-level ``prompt`` would share its
    destination with ``run``'s ``prompt``.

    Args:
        argv: Argument list without the program name. Defaults to
            ``sys.argv[1:]``.

    Returns:
        The exit status returned by the selected command handler.
    """
    import argparse

    argv = list(sys.argv[1:] if argv is None else argv)
    commands = ("pull", "list", "run", "serve")

    parser = argparse.ArgumentParser(
        prog="quantcpp",
        description="Chat with a local LLM. No API key, no GPU, no server.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
commands:
pull MODEL Download a model (e.g. llama3.2:1b)
list List cached and available models
run MODEL [PROMPT] Chat with a model (auto-pulls if needed)
serve MODEL Start OpenAI-compatible HTTP server

examples:
quantcpp pull llama3.2:1b
quantcpp list
quantcpp run llama3.2:1b
quantcpp run llama3.2:1b "What is gravity?"
quantcpp serve llama3.2:1b --port 8080

backwards-compat (no subcommand):
quantcpp # default chat with Llama-3.2-1B
quantcpp "What is gravity?" # one-shot
quantcpp --model SmolLM2-135M # different model
""",
    )

    # Top-level options are accepted in both modes, as before.
    parser.add_argument("--model", "-m", default=None,
                        help="(default mode) model name or .gguf path")
    parser.add_argument("--max-tokens", "-n", type=int, default=256)
    parser.add_argument("--temperature", "-t", type=float, default=0.7)
    parser.add_argument("--threads", "-j", type=int, default=4)
    parser.set_defaults(command=None)

    if _first_positional(argv) not in commands:
        # No subcommand -> backwards-compat default chat; all positionals
        # form the prompt.
        parser.add_argument("prompt", nargs="*", default=None,
                            help="(default mode) question to ask")
        args = parser.parse_args(argv)
        return cmd_chat_default(args)

    sub = parser.add_subparsers(dest="command")

    # pull
    p_pull = sub.add_parser("pull", help="Download a model from HuggingFace")
    p_pull.add_argument("model", help="Model name or alias (e.g. llama3.2:1b)")

    # list
    p_list = sub.add_parser("list", help="List cached and available models")
    p_list.add_argument("--json", dest="json_output", action="store_true")

    # run
    p_run = sub.add_parser("run", help="Chat with a model (auto-pulls if needed)")
    p_run.add_argument("model", help="Model name, alias, or .gguf path")
    p_run.add_argument("prompt", nargs="*", default=None, help="Optional prompt")
    p_run.add_argument("-j", "--threads", type=int, default=4)
    p_run.add_argument("-n", "--max-tokens", type=int, default=256)
    p_run.add_argument("-t", "--temperature", type=float, default=0.7)

    # serve
    p_serve = sub.add_parser("serve", help="Start OpenAI-compatible HTTP server")
    p_serve.add_argument("model", help="Model name, alias, or .gguf path")
    p_serve.add_argument("-p", "--port", type=int, default=8080)
    p_serve.add_argument("-j", "--threads", type=int, default=4)

    args = parser.parse_args(argv)

    if args.command == "pull":
        return cmd_pull(args)
    if args.command == "list":
        return cmd_list(args)
    if args.command == "run":
        return cmd_run(args)
    if args.command == "serve":
        return cmd_serve(args)

    # Defensive: the scan saw a command but argparse did not select one.
    parser.error("expected one of: " + ", ".join(commands))
61286

62287

63288
if __name__ == "__main__":
64-
main()
289+
sys.exit(main())

0 commit comments

Comments
 (0)