"""
quantcpp CLI — chat with a local LLM in your terminal.

Ollama-style commands:
    quantcpp pull MODEL         Download a model from HuggingFace
    quantcpp list               List cached and available models
    quantcpp run MODEL [Q]      Chat with a model (auto-pulls if needed)
    quantcpp serve MODEL        Start OpenAI-compatible HTTP server

Backwards-compatible shortcut:
    quantcpp                    Auto-downloads Llama-3.2-1B, starts chat
    quantcpp "What is X?"       One-shot question with default model
    quantcpp --model NAME       Use a specific model, e.g. SmolLM2-135M
                                (smaller, faster download) or
                                path/to/file.gguf (your own GGUF file)
"""
1015
1116import sys
1217import os
18+ import json
1319
1420
15- def main ():
16- import argparse
17- parser = argparse .ArgumentParser (
18- prog = "quantcpp" ,
19- description = "Chat with a local LLM. No API key, no GPU, no server." ,
20- )
21- parser .add_argument ("prompt" , nargs = "*" , help = "Question to ask (omit for interactive chat)" )
22- parser .add_argument ("--model" , "-m" , default = "Llama-3.2-1B" ,
23- help = "Model name or path to .gguf file (default: Llama-3.2-1B)" )
24- parser .add_argument ("--max-tokens" , "-n" , type = int , default = 256 )
25- parser .add_argument ("--temperature" , "-t" , type = float , default = 0.7 )
26- args = parser .parse_args ()
# Lower-case, Ollama-style shorthand -> canonical _MODEL_REGISTRY key.
# Every model is listed twice: once as a bare family name and once with a
# ":size" tag.
MODEL_ALIASES = dict(
    [
        ("smollm2", "SmolLM2-135M"),
        ("smollm2:135m", "SmolLM2-135M"),
        ("qwen3.5", "Qwen3.5-0.8B"),
        ("qwen3.5:0.8b", "Qwen3.5-0.8B"),
        ("llama3.2", "Llama-3.2-1B"),
        ("llama3.2:1b", "Llama-3.2-1B"),
    ]
)
2730
28- from quantcpp import Model
2931
30- # Load model
31- model_path = args .model
32- if os .path .isfile (model_path ):
33- print (f"Loading { model_path } ..." , file = sys .stderr )
34- m = Model (model_path , max_tokens = args .max_tokens , temperature = args .temperature )
35- else :
36- print (f"Downloading { model_path } ..." , file = sys .stderr )
37- m = Model .from_pretrained (model_path , max_tokens = args .max_tokens ,
38- temperature = args .temperature )
def _resolve_name(name):
    """Resolve user input to a canonical registry key or a local path.

    Resolution order:

    1. ``None`` is passed through unchanged.
    2. An existing filesystem path ending in ``.gguf`` is returned as-is.
    3. A ``MODEL_ALIASES`` key, matched case-insensitively, is mapped to its
       canonical name.
    4. A canonical name that appears as a ``MODEL_ALIASES`` value is also
       matched case-insensitively, so ``llama-3.2-1b`` resolves to
       ``Llama-3.2-1B``.
    5. Anything else is returned unchanged; the caller decides whether it
       is a valid registry key.

    Args:
        name: Model alias, canonical name, ``.gguf`` path, or ``None``.

    Returns:
        The resolved string, or ``None`` when ``name`` is ``None``.
    """
    if name is None:
        return None
    if os.path.exists(name) and name.endswith(".gguf"):
        return name

    key = name.lower()
    if key in MODEL_ALIASES:
        return MODEL_ALIASES[key]

    # Aliases are already case-insensitive; give the canonical spellings the
    # same treatment so a lower-cased canonical name is not rejected later.
    for canonical in MODEL_ALIASES.values():
        if canonical.lower() == key:
            return canonical

    return name
39+
40+
def _registry():
    """Return the ``(_MODEL_REGISTRY, _CACHE_DIR)`` pair from ``quantcpp``.

    The package is imported inside the function rather than at module load.
    """
    import quantcpp

    return quantcpp._MODEL_REGISTRY, quantcpp._CACHE_DIR
44+
45+
def cmd_pull(args):
    """Handle ``quantcpp pull``: download one model by alias or name.

    Args:
        args: Parsed CLI namespace; only ``args.model`` is read.

    Returns:
        0 when the download succeeds or ``args.model`` is an existing local
        ``.gguf`` path; 1 when the model is not in the registry or the
        download raises.
    """
    import quantcpp

    target = _resolve_name(args.model)

    # A local GGUF file needs no download.
    is_local_gguf = os.path.exists(target) and target.endswith(".gguf")
    if is_local_gguf:
        print(f"already local: {target} ")
        return 0

    if target not in quantcpp._MODEL_REGISTRY:
        known_models = ", ".join(sorted(quantcpp._MODEL_REGISTRY))
        known_aliases = ", ".join(sorted(MODEL_ALIASES))
        for message in (
            f"unknown model: {args.model!r} ",
            f" registry: {known_models} ",
            f" aliases: {known_aliases} ",
        ):
            print(message, file=sys.stderr)
        return 1

    print(f"pulling {target} ...", file=sys.stderr)
    try:
        path = quantcpp.download(target)
        megabytes = os.path.getsize(path) / (1024 * 1024)
        print(
            f"\u2713 {target} \u2192 {path} ({megabytes:.0f} MB)",
            file=sys.stderr,
        )
    except Exception as exc:
        # Any failure while fetching or inspecting the file is reported as a
        # failed download.
        print(f"download failed: {exc} ", file=sys.stderr)
        return 1
    return 0
72+
73+
def cmd_list(args):
    """Handle ``quantcpp list``: show every registry model and its status.

    A model is reported as ``cached`` when its file exists in the cache
    directory (size taken from the file), otherwise as ``remote`` (size
    taken from the registry's approximate value).

    Args:
        args: Parsed CLI namespace; ``args.json_output`` selects JSON output
            instead of the text table.

    Returns:
        Always 0.
    """
    registry, cache_dir = _registry()

    # For each canonical name keep the first alias that carries a ":" tag.
    tagged_alias = {}
    for alias, canonical in MODEL_ALIASES.items():
        if ":" in alias and canonical not in tagged_alias:
            tagged_alias[canonical] = alias

    rows = []
    for name in sorted(registry):
        _repo, filename, approx_mb = registry[name]
        local_file = cache_dir / filename
        if local_file.exists():
            status = "cached"
            size_mb = local_file.stat().st_size / (1024 * 1024)
            shown_path = str(local_file)
        else:
            status = "remote"
            size_mb = approx_mb
            shown_path = f"~{approx_mb} MB"
        rows.append((status, name, tagged_alias.get(name, ""), size_mb, shown_path))

    if args.json_output:
        payload = []
        for status, name, alias, size_mb, shown_path in rows:
            payload.append({
                "status": status,
                "name": name,
                "alias": alias,
                "size_mb": round(size_mb, 1),
                "path": shown_path,
            })
        print(json.dumps(payload, indent=2))
        return 0

    # One template drives the header, the rule and every data row so the
    # columns stay aligned.
    row_template = " {:<8} {:<16} {:<14} {:>8} "
    print(f"\n Models cache: {cache_dir} \n ")
    print(row_template.format("STATUS", "NAME", "ALIAS", "SIZE"))
    print(row_template.format("-" * 8, "-" * 16, "-" * 14, "-" * 8))
    for status, name, alias, size_mb, _ in rows:
        print(row_template.format(status, name, alias, f"{size_mb:.0f} MB"))
    print()
    return 0
107+
108+
def _resolve_to_path(name_or_path):
    """Turn an alias, canonical name or ``.gguf`` path into a model path.

    Args:
        name_or_path: User-supplied model identifier.

    Returns:
        The input itself when it is an existing ``.gguf`` path, the cached
        file path as a string when the model is already in the cache
        directory, otherwise whatever ``quantcpp.download`` returns.

    Raises:
        ValueError: The resolved name is not a key of the model registry.
    """
    import quantcpp

    resolved = _resolve_name(name_or_path)
    if os.path.exists(resolved) and resolved.endswith(".gguf"):
        return resolved

    registry = quantcpp._MODEL_REGISTRY
    if resolved not in registry:
        known = ", ".join(sorted(registry))
        raise ValueError(f"unknown model: {name_or_path!r} . Available: {known} ")

    _repo, filename, _approx_mb = registry[resolved]
    cached_file = quantcpp._CACHE_DIR / filename
    if not cached_file.exists():
        print(f"model not cached \u2014 pulling {resolved} ...", file=sys.stderr)
        return quantcpp.download(resolved)
    return str(cached_file)
130+
131+
132+ def cmd_run (args ):
133+ """Chat with a model (auto-pull if needed)."""
134+ try :
135+ model_path = _resolve_to_path (args .model )
136+ except ValueError as e :
137+ print (str (e ), file = sys .stderr )
138+ return 1
139+ except Exception as e :
140+ print (f"pull failed: { e } " , file = sys .stderr )
141+ return 1
142+
143+ from quantcpp import Model
144+ print (f"loading { os .path .basename (model_path )} ..." , file = sys .stderr )
145+ m = Model (model_path , max_tokens = args .max_tokens , temperature = args .temperature ,
146+ n_threads = args .threads )
39147
40- # One-shot or interactive
41148 if args .prompt :
42- question = " " .join (args .prompt )
149+ question = " " .join (args .prompt ) if isinstance ( args . prompt , list ) else args . prompt
43150 for tok in m .generate (question ):
44151 print (tok , end = "" , flush = True )
45152 print ()
46153 else :
47- print ("quantcpp — type your message, Ctrl+C to exit" , file = sys .stderr )
154+ print ("quantcpp \u2014 type your message, Ctrl+C to exit" , file = sys .stderr )
48155 try :
49156 while True :
50157 question = input ("\n You: " )
@@ -58,7 +165,125 @@ def main():
58165 print ("\n Bye!" , file = sys .stderr )
59166
60167 m .close ()
168+ return 0
169+
170+
def cmd_serve(args):
    """Start OpenAI-compatible HTTP server (requires quant-server binary).

    The model is resolved (and downloaded if needed), the ``quant-server``
    executable is located, and the current process is then replaced by the
    server via ``os.execvp``.

    Args:
        args: Parsed CLI namespace; reads ``args.model``, ``args.port`` and
            ``args.threads``.

    Returns:
        1 when the model cannot be resolved or the server cannot be
        executed, 2 when no ``quant-server`` binary is found. On success
        the call does not return because the process image is replaced.
    """
    import shutil

    try:
        model_path = _resolve_to_path(args.model)
    except Exception as e:
        print(f"error: {e} ", file=sys.stderr)
        return 1

    binary = shutil.which("quant-server")
    if not binary:
        # Fall back to common build directories. The paths are relative, so
        # they resolve against the current working directory.
        for guess in ("./build/quant-server", "./build_metal/quant-server"):
            if os.path.isfile(guess) and os.access(guess, os.X_OK):
                binary = guess
                break

    if not binary:
        print("quant-server binary not found.", file=sys.stderr)
        print(" Build with: cmake -B build -DTQ_BUILD_SERVER=ON && cmake --build build",
              file=sys.stderr)
        print(" Or install via your package manager.", file=sys.stderr)
        return 2

    cmd = [binary, model_path, "-p", str(args.port), "-j", str(args.threads)]
    print(f"quant serve {os.path.basename(model_path)} on :{args.port} ", file=sys.stderr)

    # exec replaces the process without flushing Python's file buffers, so
    # anything still buffered would be lost unless flushed first.
    sys.stdout.flush()
    sys.stderr.flush()
    try:
        os.execvp(cmd[0], cmd)
    except OSError as e:
        # Report a binary that cannot be executed as a CLI error instead of
        # letting the traceback escape.
        print(f"failed to start {binary}: {e}", file=sys.stderr)
        return 1
200+
201+
def cmd_chat_default(args):
    """Run the no-subcommand shortcut: chat with the default model.

    Fills in whatever ``cmd_run`` expects on ``args`` and delegates to it:
    a missing/empty model becomes ``Llama-3.2-1B``, absent ``threads``,
    ``max_tokens`` and ``temperature`` attributes get their defaults, and an
    empty prompt is normalised to ``None``.

    Args:
        args: Parsed CLI namespace; must already carry ``model`` and
            ``prompt`` attributes.

    Returns:
        The exit status returned by ``cmd_run``.
    """
    if not args.model:
        args.model = "Llama-3.2-1B"

    fallbacks = (("threads", 4), ("max_tokens", 256), ("temperature", 0.7))
    for attr, fallback in fallbacks:
        setattr(args, attr, getattr(args, attr, fallback))

    if not args.prompt:
        args.prompt = None

    return cmd_run(args)
210+
211+
def parse_cli(argv=None):
    """Parse quantcpp command-line arguments into an ``argparse.Namespace``.

    Two invocation styles are recognised:

    * Subcommands: ``pull``, ``list``, ``run`` and ``serve``.
    * The backwards-compatible shortcut with no subcommand: an optional
      free-text prompt plus ``--model`` and the generation options.

    The shortcut's prompt is parsed by a separate parser. Declaring it as a
    top-level positional next to the subparsers makes argparse feed the
    first prompt word to the subcommand slot, where it is rejected as an
    invalid choice, and lets the zero-argument top-level ``prompt`` replace
    the prompt parsed by ``run``.

    Subcommand options that also exist at top level default to
    ``argparse.SUPPRESS`` so a value given before the subcommand is kept
    unless the subcommand sets it explicitly.

    Args:
        argv: Argument list without the program name. Defaults to
            ``sys.argv[1:]``.

    Returns:
        The parsed namespace. ``command`` is ``None`` for the shortcut and
        ``prompt`` is always present.
    """
    import argparse

    tokens = list(sys.argv[1:] if argv is None else argv)

    commands = ("pull", "list", "run", "serve")
    # Top-level options that consume the next token as their value.
    value_options = {
        "--model", "-m",
        "--max-tokens", "-n",
        "--temperature", "-t",
        "--threads", "-j",
    }

    def first_positional():
        """Return the first token that is not an option or option value."""
        i = 0
        while i < len(tokens):
            tok = tokens[i]
            if tok == "--":
                return tokens[i + 1] if i + 1 < len(tokens) else None
            if tok in value_options:
                i += 2
                continue
            # Like argparse, treat dash-prefixed text containing a space as
            # positional (e.g. a prompt such as "-5 + 3 = ?").
            if tok.startswith("-") and len(tok) > 1 and " " not in tok:
                i += 1
                continue
            return tok
        return None

    # NOTE(review): the column alignment of this help text could not be
    # recovered from the reviewed copy; restore the original spacing if it
    # differs.
    parser_kwargs = dict(
        prog="quantcpp",
        description="Chat with a local LLM. No API key, no GPU, no server.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
commands:
pull MODEL Download a model (e.g. llama3.2:1b)
list List cached and available models
run MODEL [PROMPT] Chat with a model (auto-pulls if needed)
serve MODEL Start OpenAI-compatible HTTP server

examples:
quantcpp pull llama3.2:1b
quantcpp list
quantcpp run llama3.2:1b
quantcpp run llama3.2:1b "What is gravity?"
quantcpp serve llama3.2:1b --port 8080

backwards-compat (no subcommand):
quantcpp # default chat with Llama-3.2-1B
quantcpp "What is gravity?" # one-shot
quantcpp --model SmolLM2-135M # different model
""",
    )

    def add_global_options(target):
        """Register the options accepted at top level in both modes."""
        target.add_argument("--model", "-m", default=None,
                            help="(default mode) model name or .gguf path")
        target.add_argument("--max-tokens", "-n", type=int, default=256)
        target.add_argument("--temperature", "-t", type=float, default=0.7)
        target.add_argument("--threads", "-j", type=int, default=4)

    lead = first_positional()
    if lead is not None and lead not in commands:
        # Backwards-compat one-shot: every positional belongs to the prompt.
        legacy = argparse.ArgumentParser(**parser_kwargs)
        add_global_options(legacy)
        legacy.add_argument("prompt", nargs="*", default=None,
                            help="(default mode) question to ask")
        legacy.set_defaults(command=None)
        return legacy.parse_args(tokens)

    parser = argparse.ArgumentParser(**parser_kwargs)
    add_global_options(parser)
    # Keeps ``args.prompt`` defined when no subcommand is given.
    parser.set_defaults(prompt=None)

    sub = parser.add_subparsers(dest="command")

    # pull
    p_pull = sub.add_parser("pull", help="Download a model from HuggingFace")
    p_pull.add_argument("model", help="Model name or alias (e.g. llama3.2:1b)")

    # list
    p_list = sub.add_parser("list", help="List cached and available models")
    p_list.add_argument("--json", dest="json_output", action="store_true")

    # run
    p_run = sub.add_parser("run", help="Chat with a model (auto-pulls if needed)")
    p_run.add_argument("model", help="Model name, alias, or .gguf path")
    p_run.add_argument("prompt", nargs="*", default=None, help="Optional prompt")
    p_run.add_argument("-j", "--threads", type=int, default=argparse.SUPPRESS)
    p_run.add_argument("-n", "--max-tokens", type=int, default=argparse.SUPPRESS)
    p_run.add_argument("-t", "--temperature", type=float, default=argparse.SUPPRESS)

    # serve
    p_serve = sub.add_parser("serve", help="Start OpenAI-compatible HTTP server")
    p_serve.add_argument("model", help="Model name, alias, or .gguf path")
    p_serve.add_argument("-p", "--port", type=int, default=8080)
    p_serve.add_argument("-j", "--threads", type=int, default=argparse.SUPPRESS)

    return parser.parse_args(tokens)


def main(argv=None):
    """Parse the command line and dispatch to the matching handler.

    Args:
        argv: Argument list without the program name. Defaults to
            ``sys.argv[1:]``.

    Returns:
        The exit status returned by the selected ``cmd_*`` function.
        Invocations without a subcommand go to ``cmd_chat_default``.
    """
    args = parse_cli(argv)

    handlers = {
        "pull": cmd_pull,
        "list": cmd_list,
        "run": cmd_run,
        "serve": cmd_serve,
    }
    # No subcommand -> backwards-compatible default chat.
    handler = handlers.get(args.command, cmd_chat_default)
    return handler(args)
61286
62287
if __name__ == "__main__":
    # Hand the handler's return value to the interpreter as the exit status.
    exit_status = main()
    sys.exit(exit_status)
0 commit comments