// Run:
//   .\quadtrix_bench.exe data\input.txt
//   .\quadtrix_bench.exe data\input.txt --tokens 100 --runs 10 --warmup 3
//
// Flags (all optional):
//   --tokens N   tokens to generate per run       (default: 50)
//   --runs   N   how many timed runs per prompt   (default: 5)
//   --warmup N   un-timed warmup runs per prompt  (default: 2)
9+
#include <algorithm>
#include <cerrno>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <exception>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <limits>
#include <numeric>
#include <string>
#include <vector>

#include "config/config.h"
#include "include/dataloader.h"
#include "include/gpt.h"
24+
// Reports whether the file at `p` can be opened for reading in binary mode.
static bool file_exists(const std::string &p)
{
    // A stream whose open failed has failbit set, so good() is false.
    return std::ifstream(p.c_str(), std::ios::binary).good();
}
30+
// Current reading of std::chrono::steady_clock, expressed as fractional
// milliseconds since that clock's epoch. Only differences between two
// readings are meaningful.
static double now_ms()
{
    using clock = std::chrono::steady_clock;
    using millis = std::chrono::duration<double, std::milli>;

    const millis since_epoch = clock::now().time_since_epoch();
    return since_epoch.count();
}
38+
// Arithmetic mean of the samples in `v`.
//
// Returns 0.0 for an empty vector; the plain sum / size expression would
// evaluate 0.0 / 0 and yield NaN, which then poisons every derived statistic.
static double mean(const std::vector<double> &v)
{
    if (v.empty())
        return 0.0;

    const double total = std::accumulate(v.begin(), v.end(), 0.0);
    return total / static_cast<double>(v.size());
}
43+
// Population standard deviation of `v` around the supplied mean `m`
// (the squared deviations are divided by N, not N - 1).
//
// Returns 0.0 for an empty vector instead of the NaN that 0.0 / 0 produces.
static double stdev(const std::vector<double> &v, double m)
{
    if (v.empty())
        return 0.0;

    double sq = 0.0;
    for (const double x : v)
    {
        const double delta = x - m;
        sq += delta * delta;
    }
    return std::sqrt(sq / static_cast<double>(v.size()));
}
51+
52+ static double timed_run (GPTLanguageModel &model,
53+ DataLoader &dl,
54+ const std::vector<int > &prompt_ctx,
55+ int n_tokens)
56+ {
57+ std::vector<int > ctx = prompt_ctx;
58+
59+ double t0 = now_ms ();
60+ for (int i = 0 ; i < n_tokens; ++i)
61+ {
62+ ctx = model.generate (ctx, 1 );
63+ if ((int )ctx.size () > BLOCK_SIZE)
64+ ctx = std::vector<int >(ctx.end () - BLOCK_SIZE, ctx.end ());
65+ }
66+ return now_ms () - t0;
67+ }
68+
69+ //
70+
// Writes a section heading line for `title` to standard output.
static void section(const std::string &title)
{
    std::ostream &out = std::cout;
    out << " ";
    out << title;
    out << " \n ";
}
76+
// Aggregated timing statistics for one benchmarked prompt.
//
// Kept as a plain aggregate: callers brace-initialize it positionally, so the
// member order below is part of the interface.
struct PromptResult
{
    std::string label;  // display label for the prompt (possibly truncated)
    int prompt_tokens;  // number of context tokens the run started from
    int gen_tokens;     // number of tokens generated per run
    double avg_ms;      // mean run time, milliseconds
    double min_ms;      // fastest run, milliseconds
    double max_ms;      // slowest run, milliseconds
    double std_ms;      // standard deviation of run times, milliseconds
    double avg_tps;     // throughput in tokens per second
};
88+
89+ static PromptResult bench_prompt (GPTLanguageModel &model,
90+ DataLoader &dl,
91+ const std::string &prompt,
92+ int n_tokens,
93+ int n_runs,
94+ int n_warmup)
95+ {
96+ // encode
97+ std::vector<int > ctx = dl.encode (prompt);
98+ if (ctx.empty ())
99+ ctx = {0 };
100+ if ((int )ctx.size () > BLOCK_SIZE)
101+ ctx = std::vector<int >(ctx.end () - BLOCK_SIZE, ctx.end ());
102+
103+ int prompt_len = (int )ctx.size ();
104+
105+ // warmup (un-timed)
106+ for (int i = 0 ; i < n_warmup; ++i)
107+ timed_run (model, dl, ctx, n_tokens);
108+
109+ // timed runs
110+ std::vector<double > times;
111+ times.reserve (n_runs);
112+ for (int i = 0 ; i < n_runs; ++i)
113+ times.push_back (timed_run (model, dl, ctx, n_tokens));
114+
115+ double m = mean (times);
116+ double sd = stdev (times, m);
117+ double mn = *std::min_element (times.begin (), times.end ());
118+ double mx = *std::max_element (times.begin (), times.end ());
119+ double tps = n_tokens / (m / 1000.0 );
120+
121+ // truncate prompt for display
122+ std::string label = prompt.size () > 30
123+ ? prompt.substr (0 , 27 ) + " ..."
124+ : prompt;
125+
126+ return PromptResult{label, prompt_len, n_tokens, m, mn, mx, sd, tps};
127+ }
128+
129+ static void print_table (const std::vector<PromptResult> &results)
130+ {
131+ section (" RESULTS" );
132+
133+ // header
134+ std::cout << std::left
135+ << std::setw (34 ) << " Prompt"
136+ << std::right
137+ << std::setw (8 ) << " P.Tok"
138+ << std::setw (8 ) << " G.Tok"
139+ << std::setw (10 ) << " Avg ms"
140+ << std::setw (10 ) << " Min ms"
141+ << std::setw (10 ) << " Max ms"
142+ << std::setw (9 ) << " Std ms"
143+ << std::setw (10 ) << " tok/s"
144+ << " \n " ;
145+ std::cout << std::string (99 , ' -' ) << " \n " ;
146+
147+ std::cout << std::fixed;
148+ for (const auto &r : results)
149+ {
150+ std::cout << std::left
151+ << std::setw (34 ) << r.label
152+ << std::right
153+ << std::setw (8 ) << r.prompt_tokens
154+ << std::setw (8 ) << r.gen_tokens
155+ << std::setw (10 ) << std::setprecision (1 ) << r.avg_ms
156+ << std::setw (10 ) << std::setprecision (1 ) << r.min_ms
157+ << std::setw (10 ) << std::setprecision (1 ) << r.max_ms
158+ << std::setw (9 ) << std::setprecision (1 ) << r.std_ms
159+ << std::setw (10 ) << std::setprecision (2 ) << r.avg_tps
160+ << " \n " ;
161+ }
162+
163+ double total_avg_tps = 0.0 ;
164+ double best_tps = 0.0 ;
165+ for (const auto &r : results)
166+ {
167+ total_avg_tps += r.avg_tps ;
168+ best_tps = std::max (best_tps, r.avg_tps );
169+ }
170+ double overall_tps = total_avg_tps / results.size ();
171+
172+ std::cout << " \n Overall avg throughput : "
173+ << std::setprecision (2 ) << overall_tps << " tok/s\n " ;
174+ std::cout << " Peak throughput : "
175+ << std::setprecision (2 ) << best_tps << " tok/s\n " ;
176+ std::cout << " ms per token (avg) : "
177+ << std::setprecision (2 ) << 1000.0 / overall_tps << " ms\n " ;
178+ }
179+
180+ static void save_csv (const std::vector<PromptResult> &results,
181+ const std::string &path)
182+ {
183+ std::ofstream f (path);
184+ if (!f)
185+ {
186+ std::cerr << " [WARN] Could not write CSV to " << path << " \n " ;
187+ return ;
188+ }
189+ f << " prompt,prompt_tokens,gen_tokens,avg_ms,min_ms,max_ms,std_ms,tok_per_sec\n " ;
190+ for (const auto &r : results)
191+ {
192+ f << " \" " << r.label << " \" ,"
193+ << r.prompt_tokens << " ,"
194+ << r.gen_tokens << " ,"
195+ << r.avg_ms << " ,"
196+ << r.min_ms << " ,"
197+ << r.max_ms << " ,"
198+ << r.std_ms << " ,"
199+ << r.avg_tps << " \n " ;
200+ }
201+ std::cout << " \n CSV saved to: " << path << " \n " ;
202+ }
203+
204+ int main (int argc, char *argv[])
205+ {
206+
207+ std::string data_path = DEFAULT_CLEANED_PATH;
208+ std::string model_path = BEST_MODEL_PATH;
209+ int n_tokens = 50 ;
210+ int n_runs = 5 ;
211+ int n_warmup = 2 ;
212+
213+ for (int i = 1 ; i < argc; ++i)
214+ {
215+ std::string a = argv[i];
216+ if (a == " --tokens" && i + 1 < argc)
217+ n_tokens = std::atoi (argv[++i]);
218+ else if (a == " --runs" && i + 1 < argc)
219+ n_runs = std::atoi (argv[++i]);
220+ else if (a == " --warmup" && i + 1 < argc)
221+ n_warmup = std::atoi (argv[++i]);
222+ else
223+ data_path = a;
224+ }
225+
226+ std::cout << " Quadtrix Inference Benchmark\n " ;
227+ std::cout << " data : " << data_path << " \n " ;
228+ std::cout << " model : " << model_path << " \n " ;
229+ std::cout << " tokens : " << n_tokens << " per run\n " ;
230+ std::cout << " runs : " << n_runs << " timed + "
231+ << n_warmup << " warmup\n " ;
232+
233+ DataLoader dl;
234+ try
235+ {
236+ dl.load (data_path);
237+ }
238+ catch (const std::exception &e)
239+ {
240+ std::cerr << " [ERROR] " << e.what () << " \n " ;
241+ return 1 ;
242+ }
243+
244+ if (!file_exists (model_path))
245+ {
246+ std::cerr << " [ERROR] Weights not found at " << model_path << " \n " ;
247+ std::cerr << " [HINT] Train first, or set " << MODEL_PATH_ENV_VAR << " \n " ;
248+ return 1 ;
249+ }
250+
251+ GPTLanguageModel model (dl.vocab_size , N_EMBD, N_HEAD, N_LAYER, BLOCK_SIZE, SEED);
252+ model.load (model_path);
253+
254+ std::cout << " \n [OK] Model loaded (" << model.num_params () / 1 .0e6f
255+ << " M params)\n " ;
256+
257+ std::vector<std::string> prompts = {
258+ " " ,
259+ " The" , // 1-token prompt
260+ " Once upon a time" , // short prompt
261+ " The quick brown fox jumps" , // medium prompt
262+ std::string (1 , ' a' ), // long prompt (stress-tests context window)
263+ };
264+
265+ section (" RUNNING" );
266+ std::vector<PromptResult> results;
267+ results.reserve (prompts.size ());
268+
269+ for (size_t i = 0 ; i < prompts.size (); ++i)
270+ {
271+ std::string display = prompts[i].empty ()
272+ ? " (empty / BOS)"
273+ : (prompts[i].size () > 30
274+ ? prompts[i].substr (0 , 27 ) + " ..."
275+ : prompts[i]);
276+
277+ std::cout << " [" << (i + 1 ) << " /" << prompts.size () << " ] \" "
278+ << display << " \" ... " << std::flush;
279+
280+ PromptResult r = bench_prompt (model, dl,
281+ prompts[i],
282+ n_tokens, n_runs, n_warmup);
283+ results.push_back (r);
284+
285+ std::cout << std::fixed << std::setprecision (2 )
286+ << r.avg_tps << " tok/s\n " ;
287+ }
288+
289+ print_table (results);
290+ save_csv (results, " benchmark_results.csv" );
291+
292+ std::cout << " \n " ;
293+
294+ std::cout << " Done.\n " ;
295+ return 0 ;
296+ }
0 commit comments